#!/usr/bin/perl # spamsort : a backend script to tune spamassassin using human flagged spam and ham (non-spam) messages. Written by: # Marc Swanson # M. Swanson Consulting # mswanson_NO_SPAM_@mswanson.com # http://www.mswanson.com #This code is designed to check two sets of email boxes: one flagged for spam, one flagged for ham (not spam). #new messages are checked via imap off of the server and parsed to appear as the original message (prior to forwarding) #before being actually passed to sa-learn. The typical use of this script is to setup a special mail user #(spam_sort for instance) and alias is_spam@yourdomain.com and not_spam@yourdomain.com. Then, #simply forward spam messages to is_spam@yourdomain.com and ham messages to not_spam@yourdomain.com, put this script #into cron for nightly processing, and your bayes db will be automatically updated. #IMPORTANT: be sure that you have two folders at the root of your imap account: is_spam and not_spam # (or whatever you set spam_account and ham_account to) #Tune the variables below to fit your implementation use Mail::IMAPClient; my $mail_host = '127.0.0.1'; #host running your IMAP server my $mail_user = 'spam_account'; #account used for this purpose my $mail_password = 'password'; #imap password my $inbox_folder = 'INBOX'; my $bayes_dir = "~/.spamassassin"; #the folder containing the bayes_* files my $bayes_owner = ""; #passed to chown command after modifying bayes db my $debug = 0; my $spam_count = 0; my $ham_count = 0; my $inbox_count = 0; my $logfile = "/var/log/spamfilter.log"; my $is_spam_regex = '^[<\'"]*is_spam@\S+\.\S{2,3}'; my $not_spam_regex = '^[<\'"]*not_spam@\S+\.\S{2,3}'; my $from_regex = ''; #if you want to be strict about who/what domains are accepted for submissions, put them in here #fails when copying/moving a message via imap rather than forwarding. my $msg_tmp = '/tmp/sa_msg_tmp.txt'; my $spam_account = 'is_spam'; my $ham_account = 'not_spam'; my $delete_filtered_msgs = 0; #if set to 1, after sa-learn is called on messages in is_spam and not_spam folders, message #is deleted (original in INBOX is preserved). Otherwise, message is marked as read. my $process_all_filtered_msgs = 1; #This allows the ability to move entire messages via imap to this folder #and the script will process them. NOTE, if you don't set the delete_filtered_msgs variable to '1' #along with this one, you will be re-processing all of your old, processed messages all over again. #this isn't an error, but will take longer for the script to run and chew unnecessary processing. my $delete_inbox_msgs = 0; #if set to 1, removes any messages in the INBOX folder after processing. # open log and make sure it is unbuffered. From now on all prints # will print to the log unless specified otherwise. open LOG, ">>$logfile" or stop_program("Can't open $logfile for write: $!\n"); select LOG; $| = 1; my $imap = Mail::IMAPClient->new; $imap = Mail::IMAPClient->new( Server => $mail_host, User => $mail_user, Password => $mail_password ) or stop_program("Cannot connect to $mail_host as $mail_user: $@"); #make it so that reading the message doesn't automatically mark it as read $imap->Peek(false); #process the inbox process_inbox(); #now, process any messages marked as 'new' in the is_spam and not_spam folders process_messages('spam'); process_messages('ham'); #finally, rebuild the bayes database system("sa-learn --rebuild"); #fix the permissions on spamassassin files to be owned by the original owner if($bayes_owner) { system("chown $bayes_owner $bayes_dir/bayes_*"); } #finish up print "Done! Processed $inbox_count new messages in inbox, $spam_count new spam messages, and $ham_count non-spam messages.\n"; stop_program(""); ####################################### #just subroutines below here sub process_inbox { print "Processing inbox: ", scalar localtime time, "\n"; #select the INBOX folder $imap->select($inbox_folder) or stop_program("Could not select: $@\n"); #look for unseen messages my @unseen = $imap->unseen or print "No unseen messages in inbox: $@\n"; #now, loop over the unseen messages.. foreach my $msg_id (@unseen) { $inbox_count++; if($debug > 1) { print "DEBUG:: processing unseen message: $msg_id\n"; } system("rm -f $msg_tmp"); my $type = ''; my $new_msg_string = ''; my $mime_boundary = ''; my $sub_mime_boundary = ''; my $mime_section = 0; my $fwd_type = ''; my $msg_from = $imap->get_header($msg_id, "From"); my $msg_to = $imap->get_header($msg_id, "To"); my $msg_subject = $imap->get_header($msg_id, "Subject"); my $msg_ident = "Msgid: $msg_id, subject: $msg_subject"; if($from_regex && !($msg_from =~ /$from_regex/)) { print "Warning: Inbox message [$msg_ident] does not match allowable from regex. Skipping.\n"; $imap->see(($msg_id)) or stop_program("Could not see message [$msg_ident]: $@\n"); next; } if($msg_to =~ /$is_spam_regex/i) { $type = 'spam'; print "Processing inbox message with id $msg_id, type is $type\n"; } elsif($msg_to =~ /$not_spam_regex/i) { $type = 'ham'; print "Processing inbox message with id $msg_id, type is $type\n"; } else { print "Warning: Inbox message [$msg_ident] spam/ham status could not be determined from to header: [$msg_to]. Skipping.\n"; $imap->see(($msg_id)) or stop_program("Could not see message [$msg_ident]: $@\n"); next; } #to avoid big memory usage, save the message as a file, then read it a line at a time if(!$imap->message_to_file($msg_tmp,($msg_id))) { print "Could not message_to_file on msg_id $msg_id: $@\n"; next; } #now, figure out what we have been given #grep through the headers to find the 'to' string open MSG, $msg_tmp or stop_program("could not read $msg_tmp for read: $!\n"); while() { if($mime_boundary eq "" && /^Content-Type:\s+multipart\/mixed;\s*boundary=["']{0,1}(.*?)["']{0,1}\s*$/i) { $mime_boundary = quotemeta($1); if($debug > 1) { print "DEBUG::process_inbox: detected master mime boundary: $mime_boundary\n"; } next; } if($mime_boundary ne "" && $_ =~ /^\s*--$mime_boundary\s*/) { $mime_section++; if($debug > 1) { print "DEBUG::process_inbox: detected new mime section. mime index: $mime_section\n"; } } if($mime_section == 2) { #this should be the forwarded bit #the tricky part is we need to handle single and multiple forwards if($debug > 1 && !$fwd_type) { print "DEBUG::process_inbox: reached mime section 2, have not yet determined message type\n"; } if(!$fwd_type && /^Content-Type:\s+multipart\/digest;\s*boundary=["']{0,1}(.*?)["']{0,1}\s*$/i) { $fwd_type = 'multi'; $sub_mime_boundary = quotemeta($1); if($debug > 1) { print "DEBUG::process_inbox: detected multiple-forwards type message, sub mime boundary: $sub_mime_boundary\n"; } last; } elsif(!$fwd_type && /^Content-Type:/i) { $fwd_type = 'single'; if($debug > 1) { print "DEBUG::process_inbox: message is of type single forward or original copy\n"; } last; } } } if($fwd_type eq 'single') { #go until we hit a space, followed by non-space find_lines('^\s*$',0); $new_msg_string = find_lines('^\S+:',0); $new_msg_string .= find_lines("^\\s*--$mime_boundary--\\s*\$",-1); if($debug > 1) { print "DEBUG:: message is of class single\n"; } create_message($type,$new_msg_string); } elsif($fwd_type eq 'multi' && $sub_mime_boundary) { #in this case, there is an indeterminate number of messages that we need to get start and stop lines for if($debug > 1) { print "DEBUG:: message is of class multi\n"; } find_lines("^\\s*--$sub_mime_boundary\\s*\$",0); while() { #go until we hit a space, followed by non-space find_lines('^\s*$',0); $new_msg_string = find_lines('^\S+:',0); my $line_no = $.; my $rest = find_lines("^\\s*--$sub_mime_boundary\\s*\$",-1); if($rest eq "") { #this must be the last message in the forward list #we need to rewind the fh to the last line number #easiest way is to re-open, and drill down to the line number close(MSG); open MSG, $msg_tmp or stop_program("could not re-open $msg_tmp for read: $!\n"); do { my $crap = } until $. == $line_no || eof; $rest = find_lines("^\\s*--$sub_mime_boundary--\\s*\$",-1); } if($rest ne "") { #now we should have the complete msg. save this as a new message in the appropriate folder $new_msg_string .= $rest; create_message($type,$new_msg_string); } } } else { #assume we literally want the whole message if($debug) { print "Assuming message should be treated as an original for [$msg_ident].\n"; } create_message($type,$imap->message_string($msg_id)); } if($delete_inbox_msgs) { #mark the message as read $imap->delete_message(($msg_id)) or stop_program("Could not delete message [$msg_ident]: $@\n"); if($debug > 1) { print "DEBUG:: deleted container message [$msg_ident].\n"; } } else { #mark the message as read $imap->see(($msg_id)) or stop_program("Could not see message [$msg_ident]: $@\n"); if($debug > 1) { print "DEBUG:: marked container message as read [$msg_ident].\n"; } } } } sub process_messages { my $type = shift; if($debug > 1) { print "DEBUG::process_messages type = $type\n"; } my $src_folder; if($type eq 'spam') { $src_folder = 'is_spam'; } elsif($type eq 'ham') { $src_folder = 'not_spam'; } else { print "Warning, asked to process an unknown type: $type\n"; return; } #select the indicated folder $imap->select($src_folder) or stop_program("Could not select $src_folder: $@\n"); #look for unseen messages my @msgs; if($process_all_filtered_msgs) { @msgs = $imap->messages or print "No messages in $src_folder: $@\n"; } else { @msgs = $imap->unseen or print "No unseen messages in $src_folder: $@\n"; } foreach my $msg_id (@msgs) { if($debug > 1) { print "DEBUG:: processing filtered message of type $type, msg_id $msg_id\n"; } #save this message to a temp file if(!$imap->message_to_file($msg_tmp,($msg_id))) { print "Could not message_to_file on $msg_id: $@\n"; return; } #now run sa-learn on this message learn_message($type,$msg_tmp); if($delete_filtered_msgs) { $imap->delete_message(($msg_id)) or stop_program("Could not delete message: $@\n"); } else { #and mark the message as read $imap->see(($msg_id)) or stop_program("Could not see: $@\n"); } if($debug > 1) { print "DEBUG:: marked filtered message as read.\n"; } if($type eq 'spam') { $spam_count++; } elsif($type eq 'ham') { $ham_count++; } } } sub create_message { my $type = shift; my $msg_string = shift; if(!$msg_string) { print "Warning: cannot create message. Empty contents.\n"; return; } if($debug > 1) { print "DEBUG::create_message: type = $type\n"; print "\tMessage content preview:[" . substr($msg_string,0,100) . "]\n"; } my $dst_folder; #ok, now we append this message to the appropriate folder depending on message type, and mark the original as read if($type eq 'spam') { $dst_folder = "is_spam"; } elsif($type eq 'ham') { $dst_folder = "not_spam"; } if($dst_folder && $msg_string ne "") { #strip leading and trailing space $msg_string =~ s/^\s*//; $msg_string =~ s/\s*$//; #check to make sure that there is at least a blank line between headers and body (only happens with null body) if(!($msg_string =~ /^\s*$/m)) { print "Warning: Appending a blank line to message, body was null.\n"; $msg_string .= "\n\n"; } my $new_msg_id = $imap->append($dst_folder,$msg_string) or print "Could not append, msg_string was $msg_string: $@\n"; if($debug > 1) { print "DEBUG:: created new message in folder $dst_folder with id $new_msg_id.\n"; } return $new_msg_id; } return 0; } sub learn_message { my $type = shift; my $msg_file = shift; if($debug > 1) { print "DEBUG::learn_message, type = $type, msg_file = $msg_file\n"; } my $command = "sa-learn --no-rebuild --$type --file $msg_file 1>/dev/null 2>&1"; system($command); if($debug > 1) { print "DEBUG:: ran sa-learn command: $command\n"; } #delete the temp file system("rm -f $msg_file"); } sub find_lines { my $pattern = shift; my $mode = shift; if($debug > 1) { print "DEBUG::find_lines, pattern = $pattern, mode = $mode\n"; } my $str = ""; if(/$pattern/) { if($debug > 1) { print "DEBUG::find_lines, pattern found on current line: $_\n"; } if($mode == 0) { return $_; } return $str; } while() { if(/$pattern/) { if($debug > 1) { print "DEBUG::find_lines, pattern found: $_\n"; } if($mode == 0) { return $_; } elsif($mode == -1) { return $str; } } $str .= $_; } if($debug > 1) { print "DEBUG::find_lines, pattern not found: $pattern\n"; } return ""; } sub stop_program { my $msg = shift; if($msg ne "") { print $msg; } if($imap) { $imap->logout or die "Could not logout: $@\n"; } close(LOG); select STDOUT; exit; }