#!/usr/bin/perl -w # spamsort : a backend script to tune spamassassin using human flagged spam and ham (non-spam) messages. Written by: # Marc Swanson # M. Swanson Consulting # mswanson_NO_SPAM_@mswanson.com # http://www.mswanson.com #This code is designed to check two sets of email boxes: one flagged for spam, one flagged for ham (not spam). #new messages are checked via imap off of the server and parsed to appear as the original message (prior to forwarding) #before being actually passed to sa-learn. The typical use of this script is to setup a special mail user #(spam_sort for instance) and alias is_spam@yourdomain.com and not_spam@yourdomain.com. Then, #simply forward spam messages to is_spam@yourdomain.com and ham messages to not_spam@yourdomain.com, put this script #into cron for nightly processing, and your bayes db will be automatically updated. #IMPORTANT: be sure that you have two folders at the root of your imap account: is_spam and not_spam # (or whatever you set spam_account and ham_account to) #Tune the variables below to fit your implementation use Mail::IMAPClient; use Getopt::Long qw(:config no_ignore_case); use Pod::Usage; use strict; my %options = ( man => '0', help => '0', mail_host => '127.0.0.1', mail_user => 'spam_account', mail_password => 'password', inbox_folder => 'INBOX', bayes_dir => '~/.spamassassin', log_file => '', spam_address => 'is_spam', ham_address => 'not_spam', spam_folder => 'is_spam', ham_folder => 'not_spam', restrict_from => '', rm_filtered_msgs => '1', rm_inbox_msgs => '1', filtered_unread_msgs_only => '0', inbox_unread_only => '0', verbose => '0' ); # Read in the options GetOptions( "H|help|?" => \$options{help}, man => \$options{man}, "mail_host|h=s" => \$options{mail_host}, "mail_user|u=s" => \$options{mail_user}, "mail_password|p=s" => \$options{mail_password}, "inbox_folder|i=s" => \$options{inbox_folder}, "bayes_dir|b=s" => \$options{bayes_dir}, "log_file|l:s" => \$options{log_file}, "spam_address|a=s" => \$options{spam_address}, "ham_address|A=s" => \$options{ham_address}, "spam_folder|f=s" => \$options{spam_folder}, "ham_folder|F=s" => \$options{ham_folder}, "restrict_from|r:s" => \$options{restrict_from}, "rm_filtered_msgs|d!" => \$options{rm_filtered_msgs}, "rm_inbox_msgs|D!" => \$options{rm_inbox_msgs}, "filtered_unread_msgs_only|m!" => \$options{filtered_unread_msgs_only}, "inbox_unread_msgs_only|M!" => \$options{inbox_unread_msgs_only}, "verbose|v+" => \$options{verbose}, ) or pod2usage(2); pod2usage(1) if $options{help}; pod2usage(-exitstatus => 0, -verbose => 2) if $options{man}; my $msg_tmp = '/tmp/sa_msg_tmp.txt'; #declare globals my $inbox_count = 0; my $spam_count = 0; my $ham_count = 0; my $start_time = time(); if($options{log_file}) { # open log and make sure it is unbuffered. From now on all prints # will print to the log unless specified otherwise. open LOG, ">>$options{log_file}" or stop_program("Can't open $options{log_file} for write: $!\n"); select LOG; $| = 1; } my $imap = Mail::IMAPClient->new; $imap = Mail::IMAPClient->new( Server => $options{mail_host}, User => $options{mail_user}, Password => $options{mail_password} ) or stop_program("Cannot connect to $options{mail_host} as $options{mail_user}: $@"); if($options{verbose}) { print "Successfully established IMAP connection\n"; } #make it so that reading the message doesn't automatically mark it as read { #FALSE is a bareword that ticks off some perl versions. no strict; $imap->Peek(FALSE); } #make sure that the destination folders exist, and any other sanity checks return ok sanity_checks(); #figure out the perms on the bayes files before we start. my @bayes_fstats = stat("$options{bayes_dir}/bayes_toks"); if($options{verbose}) { print "Detected permissions on bayes toks as uid: " . $bayes_fstats[4] . ", gid: " . $bayes_fstats[5] . ", perms: " . $bayes_fstats[2] . "\n"; } #process the inbox process_inbox(); #now, process any messages marked as 'new' in the is_spam and not_spam folders process_messages('spam'); process_messages('ham'); #finally, rebuild the bayes database if($options{verbose}) { print "Rebuilding bayes database...\n"; } system("sa-learn --rebuild"); #fix the permissions on spamassassin files to be owned by the original owner, with original perms if($bayes_fstats[4]) { my $command = "chown $bayes_fstats[4]"; if($bayes_fstats[5]) { $command .= ":$bayes_fstats[5]"; } $command .= " $options{bayes_dir}/bayes_*"; if($options{verbose} > 1) { print "Preserving ownership on bayes files with command: $command\n"; } system($command); } #finish up my $stop_time = time(); print "Done! Processed $inbox_count new messages in inbox, $spam_count new spam messages, and $ham_count non-spam messages in " . ($stop_time - $start_time) . " second(s).\n"; stop_program(""); ####################################### #just subroutines below here sub process_inbox { print "Processing inbox: ", scalar localtime time, "\n"; #select the INBOX folder $imap->select($options{inbox_folder}) or stop_program("Could not select: $@\n"); my @msgs; if($options{inbox_unread_msgs_only}) { #look for unseen messages @msgs = $imap->unseen or print "No unseen messages in inbox: $@\n"; } else { @msgs = $imap->messages or print "No messages in inbox folder: $@\n"; } #now, loop over the unseen messages.. foreach my $msg_id (@msgs) { $inbox_count++; system("rm -f $msg_tmp"); my $type = ''; my $new_msg_string = ''; my $mime_boundary = ''; my $sub_mime_boundary = ''; my $mime_section = 0; my $fwd_type = ''; my $msg_from = $imap->get_header($msg_id, "From"); my $msg_to = $imap->get_header($msg_id, "To"); my $msg_subject = $imap->get_header($msg_id, "Subject"); my $msg_ident = "Msgid: $msg_id, subject: $msg_subject"; if($options{verbose}) { print "Processing unseen message: [$msg_ident]\n"; } if($options{restrict_from} && $msg_from !~ /^[<\'"]*$options{restrict_from}$/) { print "Warning: Inbox message [$msg_ident] does not match allowable from regex. Skipping.\n"; $imap->see(($msg_id)) or stop_program("Could not see message [$msg_ident]: $@\n"); next; } if($msg_to =~ /^[<'"]*$options{spam_address}\@\S+\.\S{2,3}/i) { $type = 'spam'; print "Processing inbox message [$msg_ident], type is $type\n"; } elsif($msg_to =~ /^[<'"]*$options{ham_address}\@\S+\.\S{2,3}/i) { $type = 'ham'; print "Processing inbox message [$msg_ident], type is $type\n"; } else { print "Warning: Inbox message [$msg_ident] spam/ham status could not be determined from to header: [$msg_to]. Skipping.\n"; $imap->see(scalar ($msg_id)) or stop_program("Could not see message [$msg_ident]: $@\n"); next; } #to avoid big memory usage, save the message as a file, then read it a line at a time if(!$imap->message_to_file($msg_tmp,scalar ($msg_id))) { print "Could not message_to_file on message [$msg_ident]: $@\n"; next; } #now, figure out what we have been given #grep through the headers to find the 'to' string open MSG, $msg_tmp or stop_program("could not read $msg_tmp for read: $!\n"); while() { if($mime_boundary eq "" && /^Content-Type:\s+multipart\/mixed;\s*boundary=["']{0,1}(.*?)["']{0,1}\s*$/i) { $mime_boundary = quotemeta($1); if($options{verbose} > 1) { print "Process_inbox: detected master mime boundary: $mime_boundary\n"; } next; } if($mime_boundary ne "" && $_ =~ /^\s*--$mime_boundary\s*/) { $mime_section++; if($options{verbose} > 1) { print "Process_inbox: detected new mime section. mime index: $mime_section\n"; } } if($mime_section == 2) { #this should be the forwarded bit #the tricky part is we need to handle single and multiple forwards if($options{verbose} > 1 && !$fwd_type) { print "Process_inbox: reached mime section 2, have not yet determined message type\n"; } if(!$fwd_type && /^Content-Type:\s+multipart\/digest;\s*boundary=["']{0,1}(.*?)["']{0,1}\s*$/i) { $fwd_type = 'multi'; $sub_mime_boundary = quotemeta($1); if($options{verbose} > 1) { print "Process_inbox: detected multiple-forwards type message, sub mime boundary: $sub_mime_boundary\n"; } last; } elsif(!$fwd_type && /^Content-Type:/i) { $fwd_type = 'single'; if($options{verbose} > 1) { print "Process_inbox: message is of type single forward or original copy\n"; } last; } } } if($fwd_type eq 'single') { #go until we hit a space, followed by non-space find_lines('^\s*$',0); $new_msg_string = find_lines('^\S+:',0); $new_msg_string .= find_lines("^\\s*--$mime_boundary--\\s*\$",-1); if($options{verbose} > 1) { print "message is of class single\n"; } create_message($type,$new_msg_string); } elsif($fwd_type eq 'multi' && $sub_mime_boundary) { #in this case, there is an indeterminate number of messages that we need to get start and stop lines for if($options{verbose} > 1) { print "message is of class multi\n"; } find_lines("^\\s*--$sub_mime_boundary\\s*\$",0); while() { #go until we hit a space, followed by non-space find_lines('^\s*$',0); $new_msg_string = find_lines('^\S+:',0); my $line_no = $.; my $rest = find_lines("^\\s*--$sub_mime_boundary\\s*\$",-1); if($rest eq "") { #this must be the last message in the forward list #we need to rewind the fh to the last line number #easiest way is to re-open, and drill down to the line number close(MSG); open MSG, $msg_tmp or stop_program("could not re-open $msg_tmp for read: $!\n"); do { my $crap = } until $. == $line_no || eof; $rest = find_lines("^\\s*--$sub_mime_boundary--\\s*\$",-1); } if($rest ne "") { #now we should have the complete msg. save this as a new message in the appropriate folder $new_msg_string .= $rest; create_message($type,$new_msg_string); } } } else { #assume we literally want the whole message if($options{verbose} > 1) { print "Assuming message should be treated as an original for [$msg_ident].\n"; } create_message($type,$imap->message_string($msg_id)); } if($options{delete_inbox_msgs}) { #mark the message as read $imap->delete_message(scalar ($msg_id)) or stop_program("Could not delete message [$msg_ident]: $@\n"); print "Deleted inbox message [$msg_ident].\n"; } else { #mark the message as read $imap->see(scalar ($msg_id)) or stop_program("Could not see message [$msg_ident]: $@\n"); print "Marked inbox message [$msg_ident] as read.\n"; } } #close the inbox folder $imap->close() or stop_program("could not close the inbox: $!\n"); if($options{verbose}) { print "Closed the inbox folder.\n"; } } sub process_messages { my $type = shift; my $src_folder; if($type eq 'spam') { $src_folder = 'is_spam'; } elsif($type eq 'ham') { $src_folder = 'not_spam'; } else { print "Warning, asked to process an unknown type: $type\n"; return; } print "Processing filtered message folder: $src_folder\n"; #select the indicated folder $imap->select($src_folder) or stop_program("Could not select $src_folder: $@\n"); #look for unseen messages my @msgs; if($options{filtered_unread_msgs_only}) { @msgs = $imap->unseen or print "No unseen messages in $src_folder: $@\n"; } else { @msgs = $imap->messages or print "No messages in $src_folder: $@\n"; } foreach my $msg_id (@msgs) { my $msg_subject = $imap->get_header($msg_id, "Subject"); my $msg_ident = "Msgid: $msg_id, subject: $msg_subject"; if($options{verbose}) { print "Processing filtered message [$msg_ident] of type $type\n"; } #save this message to a temp file if(!$imap->message_to_file($msg_tmp,scalar ($msg_id))) { print "Could not message_to_file on $msg_id: $@\n"; return; } #now run sa-learn on this message learn_message($type,$msg_tmp); if($options{rm_filtered_msgs}) { $imap->delete_message(scalar ($msg_id)) or stop_program("Could not delete message: $@\n"); if($options{verbose}) { print "Deleted filtered message [$msg_ident]\n"; } } else { #and mark the message as read $imap->see(scalar ($msg_id)) or stop_program("Could not see: $@\n"); if($options{verbose}) { print "Marked filtered message [$msg_ident] as read\n"; } } if($type eq 'spam') { $spam_count++; } elsif($type eq 'ham') { $ham_count++; } } #close the folder (performs an implicit expunge) $imap->close() or stop_program("could not close the $src_folder folder: $!\n"); if($options{verbose}) { print "Closed folder: $src_folder\n"; } } sub create_message { my $type = shift; my $msg_string = shift; if(!$msg_string) { print "Warning: cannot create message. Empty contents.\n"; return; } if($options{verbose} > 1) { print "create_message: type = $type\n"; print "\tMessage content preview:[" . substr($msg_string,0,100) . "]\n"; } my $dst_folder; #ok, now we append this message to the appropriate folder depending on message type, and mark the original as read if($type eq 'spam') { $dst_folder = $options{spam_folder}; } elsif($type eq 'ham') { $dst_folder = $options{ham_folder}; } if($dst_folder && $msg_string ne "") { #strip leading and trailing space $msg_string =~ s/^\s*//; $msg_string =~ s/\s*$//; #check to make sure that there is at least a blank line between headers and body (only happens with null body) if(!($msg_string =~ /^\s*$/m)) { print "Warning: Appending a blank line to message, body was null.\n"; $msg_string .= "\n\n"; } my $new_msg_id = $imap->append($dst_folder,$msg_string) or print "Could not append, msg_string was $msg_string: $@\n"; if($options{verbose} > 1) { print "Created new message in folder $dst_folder with id $new_msg_id.\n"; } return $new_msg_id; } return 0; } sub learn_message { my $type = shift; my $msg_file = shift; if($options{verbose}) { print "Learn_message, type = $type, msg_file = $msg_file\n"; } my $command = "sa-learn --no-rebuild --$type --file $msg_file 1>/dev/null 2>&1"; system($command); if($options{verbose}) { print "Ran sa-learn command: $command\n"; } #delete the temp file system("rm -f $msg_file"); } sub find_lines { my $pattern = shift; my $mode = shift; if($options{verbose} > 1) { print "Find_lines, pattern = $pattern, mode = $mode\n"; } my $str = ""; if(/$pattern/) { if($options{verbose} > 1) { print "Find_lines, pattern found on current line: $_\n"; } if($mode == 0) { return $_; } return $str; } while() { if(/$pattern/) { if($options{verbose} > 1) { print "Find_lines, pattern found: $_\n"; } if($mode == 0) { return $_; } elsif($mode == -1) { return $str; } } $str .= $_; } if($options{verbose} > 1) { print "Find_lines, pattern not found: $pattern\n"; } return ""; } sub stop_program { my $msg = shift; if($msg ne "") { print $msg; } if($imap) { $imap->logout or die "Could not logout: $@\n"; } if($options{log_file}) { close(LOG); select STDOUT; } exit; } sub sanity_checks { #verify that at least one bayes file exists if(!-e "$options{bayes_dir}/bayes_toks") { stop_program("Error: the bayes directory $options{bayes_dir} does not seem to contain bayes_toks. Exiting"); } #check that the folders for spam/ham exist. create if necessary my @folders = $imap->folders or die "Could not list folders: $@\n"; my $have_spam_folder = 0; my $have_ham_folder = 0; foreach my $folder (@folders) { if($folder =~ /^$options{spam_folder}$/i) { $have_spam_folder = 1; } if($folder =~ /^$options{ham_folder}$/i) { $have_ham_folder = 1; } if($have_spam_folder && $have_ham_folder) { last; } } if(!$have_spam_folder) { print "Spam destination folder [$options{spam_folder}] does not exist. Creating.\n"; #create the folder $imap->create($options{spam_folder}) or stop_program("Could not create spam destination folder [$options{spam_folder}]: $@\n"); } if(!$have_ham_folder) { print "Ham destination folder [$options{ham_folder}] does not exist. Creating.\n"; #create the folder $imap->create($options{ham_folder}) or stop_program("Could not create ham destination folder [$options{ham_folder}]: $@\n"); } } __END__ =head1 NAME Spamsort - A script to sort forwarded spam/ham into separate IMAP folders for processing by sa-learn =head1 SYNOPSIS Usage: spamsort [options] =head1 OPTIONS =over 8 =item B<-help> Print a brief help message and exits. =item B<-man> Prints the manual page and exits. =item B<-h | --mail_host=> address of IMAP server (Default=127.0.0.1) =item B<-u | --mail_user=> IMAP account name (Default=spam_account) =item B<-p | --mail_password=> IMAP password (Default=password) =item B<-i | --inbox_folder=> Inbox folder name (Default=INBOX) =item B<-b | --bayes_dir=> Directory containing bayesian db files (Default=~/.spamassassin) =item B<-l | --log_file=> Log file location. If not set, output is sent to stdout (Default=unset) =item B<-a | --spam_address=> The 'To:' header email address to parse as spam, for inbox messages only. (Default=is_spam) =item B<-A | --ham_address=> The 'To:' header email address to parse as ham, for inbox messages only. (Default=not_spam) =item B<-f | --spam_folder=> The IMAP folder where messages marked as spam are to be handed off to sa-learn as spam. Messages may be placed here manually via an IMAP copy/move for better header recognition. (Default=is_spam) =item B<-F | --ham_folder=> The IMAP folder where messages marked as ham are to be handed off to sa-learn as non-spam. Messages may be placed here manually via an IMAP copy/move for better header recognition. (Default=not_spam) =item B<-r | --restrict_from=> Only accept messages in the INBOX folder with a from address matching this argument. (Default=unset) =item B<-d | --rm_filtered_msgs=> After processing, remove all processed messages from the spam/ham folders. (Default=1) =item B<-D | --rm_inbox_msgs=> After processing, remove all processed messages from the inbox folder. (Default=0) =item B<-m | --filtered_unread_msgs_only=> Process only unread messages in the spam/ham folders. (Default=0) =item B<-M | --inbox_unread_only=> Process only unread messages in the inbox folder. (Default=1) =item B<-v | --verbose=> Level of verbosity. include more than once for more output. =back =head1 DESCRIPTION B connects to an IMAP account matching the settings specified via the command line. It will then process the messages in the inbox and look for the 'To' header to determine weather the message is spam or ham. A set of sub folders is used to hold the individual spam/ham messages where they are passed off to sa-learn. =cut