#!/bin/bash
#
# teach-sa.sh
#
# Version : 0.5.1
# Latest version is always available at http://www.ruwenzori.net/code/teach-sa/
#
# What :
# This script reads mail in designated maildir folders (spam on the one
# hand, ham on the other) and feeds them to spamassassin for bayesian
# learning and submission to various spam detection schemes.
#
# It is good for implementing any sort of supervised training in
# addition to Spamassassin's unsupervised training (also known as
# automatic whitelist) while reducing training-related admin workload
# to nearly zero. Fits any setup storing mail as maildir, but could
# trivially be modified to work with mbox based systems.
#
# Dependencies :
# - archivemail
# - spamassassin
# - formail (comes with the procmail package)
# - spamreport.pl (distributed at http://www.tls.cena.fr/~boubaker/JunkTrap/spamreport.pl
#   and mirrored at http://www.ruwenzori.net/code/teach-sa/spamreport.html).
#
#   Place spamreport.pl in the working directory, where teach-sa.sh resides.
#
# Configuration : 
# This script works on my Debian system; you may need to edit a
# few paths before it works on yours.
#
# The file CertainSpamFolderList in the working directory must contain a
# list of maildirs containing messages identified with certainty as spam
# (one maildir per line) !
#
# The file CertainHamFolderList in the working directory must contain a
# list of maildirs containing messages identified with certainty as ham
# (one maildir per line) !
#
# This program needs permission to read and write the users' maildirs.
#
# This program is a good candidate for launch with a high niceness.
#
# WARNING : the ham folder and the spam folder are both emptied during
# execution of this program. You probably won't cry over the gone spam
# but make sure that your users _copy_ messages to the ham folder instead
# of moving them there !
#
#
# Author : Jean-Marc Liotier
#
#
# License
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.
#
#
# Changelog
#
# 0.5.1  Modified default ReportDestination addresses.

# ----------------------------------------------------------------------
# ------------ Mandatory user configurable variables -------------------
# ----------------------------------------------------------------------

# Absolute path of the directory that holds teach-sa.sh, spamreport.pl and
# the CertainSpamFolderList / CertainHamFolderList configuration files.
workdir='/home/jim/applications/teach-sa'

# Address of the local postmaster (handed to spamreport.pl through -f).
ZePostmaster='postmaster@example.com'

# ----------------------------------------------------------------------
# ---------- End of mandatory user configurable variables --------------
# ----------------------------------------------------------------------


# ----------------------------------------------------------------------
# ------------ Optional user configurable variables --------------------
# ----------------------------------------------------------------------

# Subject line intended for the mailed spam reports.
ReportSubject='Forwarded spam message with full headers'

# Addresses that receive the spam reports, separated by whitespace
# (the forwarding loop sends one report per address).
ReportDestination='spam@uce.gov yourjunk@knujon.com'
# spamrecycle@chooseyourmail.com no longer works
# submit@spamarchive.org is chronically over quota

# ----------------------------------------------------------------------
# ----------- End of optional user configurable variables --------------
# ----------------------------------------------------------------------


# Todo :
# - Create end-user's self-provisioning front-end so that users can add 
#   or remove spam or ham folders. A php web page would probably be good
#   for that. Using PAM authentication would make it fit well with the
#   rest of the system, and we could get the home dir from there. The
#   maildir name we could get from /etc/maildroprc but that would be too
#   specific and we'll be better off making it a parameter in a
#   configuration file.
# - Study standard permissions schemes and try to find a way to run as
#   a user other than root.
# - Debian packaging.


# First let's forward spam to addresses who want it.
#
# Reads : $workdir/CertainSpamFolderList (one maildir path per line),
#         $ReportSubject, $ReportDestination, $ZePostmaster.
# Runs  : $workdir/spamreport.pl once per (message, recipient) pair, with the
#         message on its standard input.

# ReportDestination is a whitespace-separated list. Split it once into an
# array so each address is passed as exactly one argument and is never
# subject to pathname expansion. With -d '' read consumes the whole value
# (newlines included) and reports "end of input" with a non-zero status even
# though the array was filled, hence the deliberate '|| true'.
read -r -d '' -a ZeRecipients <<< "$ReportDestination" || true

# The folder list is read line by line on file descriptor 3, so paths
# containing spaces survive and the commands inside the loop keep their own
# standard input. The '|| [[ -n ... ]]' part handles a last line that has no
# trailing newline.
while read -r ZeCertainSpamFolder <&3 || [[ -n "$ZeCertainSpamFolder" ]]; do
  # Ignore blank lines in the list
  [[ -n "$ZeCertainSpamFolder" ]] || continue
  for ZeCertainSpamIMAPSubFolder in cur new; do
    # Let the shell glob the messages instead of parsing the output of ls
    for ZeCertainSpamMessage in "$ZeCertainSpamFolder/$ZeCertainSpamIMAPSubFolder"/*; do
      # An unmatched glob stays literal (empty or missing folder): skip it,
      # along with anything that is not a regular file
      [[ -f "$ZeCertainSpamMessage" ]] || continue
      for ZeRecipient in "${ZeRecipients[@]}"; do
        # The subject must be the *value* of ReportSubject, not the literal
        # word "ReportSubject"
        "$workdir/spamreport.pl" \
          -s "$ReportSubject" \
          -f "$ZePostmaster" \
          -t "$ZeRecipient" \
          < "$ZeCertainSpamMessage"
      done
    done
  done
done 3< "$workdir/CertainSpamFolderList"

# Tomorrow's date in ISO 8601 form (YYYY-MM-DD); used as the archivemail
# cut-off date further down.
tomorrow=$(date --iso-8601 --date='+1 day')

# Move the certain spam messages from the maildirs to temporary mbox files
# (<maildir>.sa-learn.tmp). This is needed because sa-learn groks mbox but
# not maildir.
#
# Reads : $workdir/CertainSpamFolderList (one maildir path per line), $tomorrow.
#
# The list is read line by line on file descriptor 3 and every expansion is
# quoted, so a path containing whitespace or glob characters reaches rm and
# archivemail as one single, unmodified argument.
while read -r ZeCertainSpamFolder <&3 || [[ -n "$ZeCertainSpamFolder" ]]; do
  # Ignore blank lines in the list
  [[ -n "$ZeCertainSpamFolder" ]] || continue
  # Remove any temporary mbox already present at that path
  rm -f -- "$ZeCertainSpamFolder.sa-learn.tmp"
  # archivemail is not designed to archive current messages hence the --date=$tomorrow hack
  archivemail -q --include-flagged --no-compress --date="$tomorrow" --suffix=.sa-learn.tmp "$ZeCertainSpamFolder"
done 3< "$workdir/CertainSpamFolderList"

# Move the certain ham messages from the maildirs to temporary mbox files
# (<maildir>.sa-learn.tmp). This is needed because sa-learn groks mbox but
# not maildir.
#
# Reads : $workdir/CertainHamFolderList (one maildir path per line), $tomorrow.
#
# The list is read line by line on file descriptor 3 and every expansion is
# quoted, so a path containing whitespace or glob characters reaches rm and
# archivemail as one single, unmodified argument.
while read -r ZeCertainHamFolder <&3 || [[ -n "$ZeCertainHamFolder" ]]; do
  # Ignore blank lines in the list
  [[ -n "$ZeCertainHamFolder" ]] || continue
  # Remove any temporary mbox already present at that path
  rm -f -- "$ZeCertainHamFolder.sa-learn.tmp"
  # archivemail is not designed to archive current messages hence the --date=$tomorrow hack
  archivemail -q --include-flagged --no-compress --date="$tomorrow" --suffix=.sa-learn.tmp "$ZeCertainHamFolder"
done 3< "$workdir/CertainHamFolderList"

# This script runs as root so that it can reach every user's maildir.
# sa-learn writes into the current user's home directory and the author
# found no way to point it elsewhere, while the systemwide bayesian
# database lives in /var/mail/.spamassassin. The workaround is to replace
# root's ~/.spamassassin with a symlink to the systemwide directory.
# Whatever was at /root/.spamassassin is deleted; the author considers
# this harmless because root is not supposed to use spamassassin itself.
ZeRootSpamassassinDir=/root/.spamassassin
rm -rf "$ZeRootSpamassassinDir"
ln -s /var/mail/.spamassassin "$ZeRootSpamassassinDir"

# Feed the yucky spam in each temporary spam mbox to sa-learn and report it.
#
# Reads : $workdir/CertainSpamFolderList (one maildir path per line) and the
#         <maildir>.sa-learn.tmp mbox next to each listed maildir.
#
# The list is read line by line on file descriptor 3 and every expansion is
# quoted, so a path containing whitespace or glob characters reaches
# sa-learn, formail and rm as one single, unmodified argument.
while read -r ZeCertainSpamFolder <&3 || [[ -n "$ZeCertainSpamFolder" ]]; do
  # Ignore blank lines in the list
  [[ -n "$ZeCertainSpamFolder" ]] || continue
  ZeCertainSpamMbox="$ZeCertainSpamFolder.sa-learn.tmp"
  # No temporary mbox for this folder: nothing to do
  [[ -f "$ZeCertainSpamMbox" ]] || continue
  # Spamassassin bayesian learning (database sync is deferred, see --no-sync)
  sa-learn --spam --no-sync --mbox "$ZeCertainSpamMbox"
  # Spamassassin manages the reporting to Razor, Pyzor and DCC.
  # 'formail -s' is used to split the mailbox into individual messages
  # because spamassassin does not handle mbox files.
  formail -s spamassassin --report < "$ZeCertainSpamMbox"
  # Remove the temporary file
  rm -f -- "$ZeCertainSpamMbox"
done 3< "$workdir/CertainSpamFolderList"

# Feed the yummy ham in each temporary ham mbox to sa-learn.
#
# Reads : $workdir/CertainHamFolderList (one maildir path per line) and the
#         <maildir>.sa-learn.tmp mbox next to each listed maildir.
#
# The list is read line by line on file descriptor 3 and every expansion is
# quoted, so a path containing whitespace or glob characters reaches
# sa-learn and rm as one single, unmodified argument.
while read -r ZeCertainHamFolder <&3 || [[ -n "$ZeCertainHamFolder" ]]; do
  # Ignore blank lines in the list
  [[ -n "$ZeCertainHamFolder" ]] || continue
  ZeCertainHamMbox="$ZeCertainHamFolder.sa-learn.tmp"
  # No temporary mbox for this folder: nothing to do
  [[ -f "$ZeCertainHamMbox" ]] || continue
  # Spamassassin bayesian learning (database sync is deferred, see --no-sync)
  sa-learn --ham --no-sync --mbox "$ZeCertainHamMbox"
  # Remove the temporary file
  rm -f -- "$ZeCertainHamMbox"
done 3< "$workdir/CertainHamFolderList"

# Rebuild the bayesian database a single time, now that every mbox has been
# fed to sa-learn with --no-sync; this is faster than syncing after each one.
sa-learn --sync

# We ran as root but acted on behalf of the mail user (same reason as the
# /root/.spamassassin symlink hack), so hand the database back to mail:mail.
chown --recursive mail:mail /var/mail/.spamassassin