Codebase list fuzzyocr / run/fb89a21a-9255-4244-a1b2-4c9d1b96bec2/main FuzzyOcr.cf
run/fb89a21a-9255-4244-a1b2-4c9d1b96bec2/main

Tree @run/fb89a21a-9255-4244-a1b2-4c9d1b96bec2/main (Download .tar.gz)

FuzzyOcr.cf @run/fb89a21a-9255-4244-a1b2-4c9d1b96bec2/mainraw · history · blame

# Syntax:
# loadplugin <Plugin_Name> <Location>
#  <Location> path where Plugin resides.
loadplugin FuzzyOcr FuzzyOcr.pm

body     FUZZY_OCR                   eval:fuzzyocr_check()
body     FUZZY_OCR_WRONG_CTYPE       eval:dummy_check()
body     FUZZY_OCR_CORRUPT_IMG       eval:dummy_check()
body     FUZZY_OCR_WRONG_EXTENSION   eval:dummy_check()
body     FUZZY_OCR_KNOWN_HASH        eval:dummy_check()

describe FUZZY_OCR                   Mail contains an image with common spam text inside
describe FUZZY_OCR_WRONG_CTYPE       Mail contains an image with wrong content-type set
describe FUZZY_OCR_WRONG_EXTENSION   Mail contains an image with wrong file extension
describe FUZZY_OCR_CORRUPT_IMG       Mail contains a corrupted image
describe FUZZY_OCR_KNOWN_HASH        Mail contains an image with known hash

priority FUZZY_OCR 900

###
### Plugin Configuration 
###

###
### Logging options
###

# Verbosity level (see manual)
# Level 0 - Errors only
# Level 1 - Errors and Warnings
# Level 2 - Errors, Warnings and Info Messages
# Level 3 - Full debug output
# Default value: 1
#focr_verbose 3

# Log Message-Id, From, To
# Default: 1
#focr_log_pmsinfo 0

# Send logging output to stderr.
# Default value: 1
#focr_log_stderr 0

# Logfile (make sure it is writable by the plugin) 
# Default value: none
#focr_logfile /tmp/FuzzyOcr.log

###
### Wordlists 
###

# Here we define the words to scan for
# Default value: /etc/mail/spamassassin/FuzzyOcr.words
#focr_global_wordlist /etc/mail/spamassassin/FuzzyOcr.words
#
# This is the path RELATIVE to the respective home directory
# for the personalized list. This list is merged with the global 
# word list on execution.
# Default value: ~/.spamassassin/fuzzyocr.words 
# If the value begins with '/', it is treated as a fixed (absolute) path.
#focr_personal_wordlist fuzzyocr.words
#
# This option allows you to disable the whole personalization stuff,
# i.e. FuzzyOcr will not call functions in SA that require home
# directories for your users. This is only required if you are running
# an environment where the users don't have home directories at all.
# Default value: 0
#
#focr_no_homedirs 1
#
## Optionally, disable this option if you want to scan for numbers
## Setting this to 0 will cause FuzzyOcr not to strip numbers from
## both the wordlist and the OCR results
#
#focr_strip_numbers 1


###
### Helper Applications
###

# These parameters can be used to change other detection settings
# If you leave these commented out, the defaults will be used.
# Do not use " " around any parameters!

###
### Step 1:
### Inform the plugin which helper apps are required.
###

# The following are already included by default:
#
#focr_bin_helper gifsicle, giffix, giftext, gifinter, giftopnm
#focr_bin_helper jpegtopnm, pngtopnm, bmptopnm, tifftopnm, ppmhist
#focr_bin_helper gocr, ocrad

# Include additional scanner/preprocessor commands here:
#
focr_bin_helper pnmnorm, pnminvert, pamthreshold, ppmtopgm, pamtopnm
focr_bin_helper tesseract

# These helpers must be defined before enabling PDF scanning
#focr_bin_helper pdfinfo, pdftops, pstopnm

###
### Step 2:
### Inform the plugin of the search path to find all helper apps.
### Only the first match will be considered, so the order is important.
###

# Search path for locating helper applications
#focr_path_bin /usr/local/netpbm/bin:/usr/local/bin:/usr/bin

###
### Step 3:
### You can optionally define a helper application location, bypassing
### the search path algorithm. Please note that if the helper app is not
### previously defined, it will generate an error:

#focr_bin_gifsicle /usr/bin/gifsicle
#focr_bin_giffix /usr/bin/giffix
#focr_bin_giftext /usr/bin/giftext
#focr_bin_gifinter /usr/bin/gifinter
#focr_bin_giftopnm /usr/bin/giftopnm
#focr_bin_jpegtopnm /usr/bin/jpegtopnm
#focr_bin_pngtopnm /usr/bin/pngtopnm
#focr_bin_bmptopnm /usr/bin/bmptopnm
#focr_bin_tifftopnm /usr/bin/tifftopnm
#focr_bin_ppmhist /usr/bin/ppmhist
#focr_bin_gocr /usr/bin/gocr
#focr_bin_ocrad /usr/bin/ocrad

#focr_bin_pnmnorm /usr/bin/pnmnorm
#focr_bin_pnminvert /usr/bin/pnminvert

#focr_bin_pdfinfo /usr/bin/pdfinfo
#focr_bin_pdftops /usr/bin/pdftops
#focr_bin_pstopnm /usr/bin/pstopnm

###
### Scansets 
###

# Paths to the files containing Scansets and Preprocessors definitions
#
#focr_preprocessor_file /etc/mail/spamassassin/FuzzyOcr.preps
#focr_scanset_file /etc/mail/spamassassin/FuzzyOcr.scansets

# Setting this to 1 will cause FuzzyOcr to skip all remaining scansets
# once a scanset has reached the number of hits specified in
# focr_counts_required (i.e. once the image is detected as spam).
# This saves resources, but can lower the scores, because the result is
# taken from the first scanset that qualifies rather than from the best one.
# Default value: 1
#focr_minimal_scanset 0

# This option is only used when focr_minimal_scanset is enabled.
# Basically, this counts the effectiveness of each scanset on the current
# mail traffic and re-sorts the scansets with the most effective first.
# This avoids unnecessary scanner passes and saves resources.
# Default value: 1
#focr_autosort_scanset 0

# This is a parameter for the focr_autosort_scanset function, and specifies
# the maximum value of the effectiveness counter used in each scanset. If you
# increase this, it will take longer until the autosort function adapts to new
# types of spam; setting it too low will lower the effectiveness of the
# function.
# Default value: 10
#focr_autosort_buffer 10

###
### Scan Settings
###

# Timeout for the plugin, in seconds. (Maximum runtime of the plugin)
# Default value: 10
#focr_timeout 15

# Use a global timeout value instead of per helper application.
# Default value: 0
#focr_global_timeout 1

# Minimum image size to scan. Images with dimensions smaller than the
# ones specified here will be skipped:
# (This parameter does not apply to PDF files)
# Default: Height:4 Width:4
#
#focr_min_height 4
#focr_min_width 4

# Maximum image size to scan. Images with dimensions bigger than the
# ones specified here will be skipped:
# (This parameter does not apply to PDF files)
# Default: Height:800 Width:800
#
#focr_max_height 800
#focr_max_width 800


# Maximum file size for the different formats, in bytes. Bigger pictures
# will not be scanned.
# Default values: Unlimited
#focr_max_size_gif 80000
#focr_max_size_jpeg 100000
#focr_max_size_png 80000
#focr_max_size_bmp 500000
#focr_max_size_tiff 500000

# Skip checking the following image types 
# Default value: 0 (check image type)
#focr_skip_gif 1
#focr_skip_jpeg 1
#focr_skip_png 1
#focr_skip_bmp 1
#focr_skip_tiff 1
#

# PDF specific options
# WARNING: Enable this at your own risk, this might lead to false positives and classify
#          important documents as spam. YOU HAVE BEEN WARNED.
#focr_scan_pdfs 0
# PDFs having more pages than this value will be skipped
#focr_pdf_maxpages 1

# Default detection threshold (see manual)
# Default value: 0.25 (Can be changed on a per word basis in the wordlist).
#focr_threshold 0.20

# Minimum number of matches before the rule scores (Default value: 2)
#focr_counts_required 3

# Setting this will cause every word to be matched only once per image (Default value: 0)
#focr_unique_matches 1

# This is the score for a hit after focr_counts_required matches
# Default value: 5
#focr_base_score 5

# This is the additional score for every additional match after 
# focr_counts_required matches
# Default value: 1
#focr_add_score 0.375

# This option defines the factor by which the number of matches made
# without stripping spaces is multiplied. FuzzyOcr makes two matching
# attempts on OCR results, one without stripping spaces and one with.
# This factor is applied to give the first match type more weight.
# Default value: 1.5
#focr_twopass_scoring_factor 1.5

# This is the score to give for a wrong content-type.
# e.g. JPEG image but content type says GIF
# Default value: 1.5
#focr_wrongctype_score 1.5

# This is the score to give for a wrong file extension.
# e.g. JPEG image but file extension says GIF
# Default value: 1.5
#focr_wrongext_score 1.5

# This is the score to give for a corrupted image.
# This currently affects only GIF images
# Default value: 2.5
#focr_corrupt_score 2.5

# This is the score to give for a corrupted unfixable image.
# This currently affects only GIF images.
# Default value: 5
#focr_corrupt_unfixable_score 5

# This is used to disable the OCR engine if the message already has
# more points than this value
# Default value: 10
#focr_autodisable_score 30

# This is used to disable the OCR engine if the message already has
# fewer points than this value
# Default value: -5
#focr_autodisable_negative_score -5


###
### Hashing Options (Optional)
###

# Select which type of image hashing to use:
# Default value: 0 (disabled)
# Allowed values:
#  1 ... use digest_hash only (deprecated)
#  2 ... use digest_db w/digest_hash import (see requirements, recommended)
#  3 ... use mysql database (see requirements, experimental)
#--
# The score is saved with the hash in the database, allowing the plugin to
# skip the scans when the image is found in the database, using the score
# from the previous scans.
#--
#focr_enable_image_hashing 3

# Set this to skip updating the hashing database at startup
# Default value: 0 (update at startup)
#focr_skip_updates 1

# Automatically add hashes of spam images recognized by OCR to the Image 
# Hash database, to disable, set to 0
# Default value: 1 (learn)
#focr_hashing_learn_scanned 1

# Score images whose global word count is below focr_counts_required using
# the following formula: (focr_add_score * word count) as score.
# Default value: 0 (ignore images)
#focr_score_ham 1

# If the image hash database feature is enabled (Type 1 Hashing),
# specify the file to use as database
# Default value: /etc/mail/spamassassin/FuzzyOcr.hashdb
#focr_digest_db /etc/mail/spamassassin/FuzzyOcr.hashdb

# If the image hash db feature is enabled (Type 2 Hashing),
# specify the file to use as the SPAM database
# Default value: /etc/mail/spamassassin/FuzzyOcr.db
#focr_db_hash /etc/mail/spamassassin/FuzzyOcr.db

# If the image hash db feature is enabled (Type 2 Hashing), 
# specify the file to use as the HAM database
# Default value: /etc/mail/spamassassin/FuzzyOcr.safe.db
#focr_db_safe /etc/mail/spamassassin/FuzzyOcr.safe.db

# Auto-prune: Expire records from hashing databases after this many days
# Default value: 35
#focr_db_max_days 15

###
### MySQL options (Type 3 Hashing)
###

#focr_mysql_db FuzzyOcr
#focr_mysql_hash Hash
#focr_mysql_safe Safe
#focr_mysql_user fuzzyocr
#focr_mysql_pass fuzzyocr
#focr_mysql_host localhost
#focr_mysql_port 3306
#focr_mysql_socket /tmp/mysql.sock

# If set, the database table is updated with different data from one of
# the following:
#  + filename, 
#  + image-params,
#  + content-type, 
#  + file-type, 
#  + score, 
#  + word-info
# Default value: 0
#focr_mysql_update_hash 1

###
### Miscellaneous Options
###

# The plugin uses a temporary directory to store intermediate information.
# In order to keep these files for debugging purposes, use any of these
# values:
#  0 = always cleanup (default value)
#  1 = keep only if error
#  2 = always keep
#--
# Keeping these intermediate files could fill your HDD _very_ fast!
# Make sure you periodically empty your temp dir (usually: /tmp) or
# suffer the consequences.  You've been warned!!
#--
#focr_keep_bad_images 1

#################################################################
# DO NOT REMOVE THIS LINE, IT IS REQUIRED UNDER ALL CIRCUMSTANCES
focr_end_config