commit bbdf8b1868a21006fc8eaae72b35d16c5183f1d8 Author: Lunar lunar@torproject.org Date: Fri Oct 31 14:32:40 2014 +0100
Add script to sort out spam from ham depending on ticket status --- train-spam-filters/train_spam_filters | 119 +++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+)
#!/usr/bin/python
#
# This program is free software. It comes without any warranty, to
# the extent permitted by applicable law. You can redistribute it
# and/or modify it under the terms of the Do What The Fuck You Want
# To Public License, Version 2, as published by Sam Hocevar. See
# http://sam.zoy.org/wtfpl/COPYING for more details.
#
# Added to the repository as train-spam-filters/train_spam_filters
# (new file, mode 100755).

"""Sort messages into ham/spam training maildirs based on RT ticket status.

Every message found in the ``new``, ``cur`` and ``tmp`` sub-directories of
each ``.help*`` maildir under ``MAILDIR_ROOT`` is looked up in the RT
database by its Message-Id:

* matched by ``SELECT_HAM_TICKET_QUERY`` (resolved ticket in a ``help%``
  queue): moved to ``HAM_MAILDIR``;
* matched by ``SELECT_SPAM_TICKET_QUERY`` (rejected ticket in the ``spam``
  queue): moved to ``SPAM_MAILDIR``;
* otherwise, if its mtime is at least ``KEEP_FOR_MAX_DAYS`` days old, or if
  its Message-Id is missing or malformed: reported for removal;
* otherwise: left in place for a later run.
"""

from __future__ import print_function

import email.parser
import psycopg2
import os
import os.path
from datetime import datetime, timedelta

DEBUG = False

MAILDIR_ROOT = os.path.join(os.environ['HOME'], 'Maildir')
SPAM_MAILDIR = '.spam.learn'
HAM_MAILDIR = '.xham.learn'

KEEP_FOR_MAX_DAYS = 100

RT_CONNINFO = "host=drobovi.torproject.org sslmode=require user=rtreader dbname=rt"

# '%%' is a literal '%' for the LIKE pattern; a single '%' would be read as
# the start of a query parameter placeholder.
SELECT_HAM_TICKET_QUERY = """
    SELECT DISTINCT Tickets.Id
      FROM Queues, Tickets, Transactions
           LEFT OUTER JOIN Attachments ON Attachments.TransactionId = Transactions.Id
     WHERE Queues.Name LIKE 'help%%'
       AND Tickets.Queue = Queues.Id
       AND Tickets.Status = 'resolved'
       AND Transactions.ObjectId = Tickets.Id
       AND Transactions.ObjectType = 'RT::Ticket'
       AND Attachments.MessageId = %s;
"""

SELECT_SPAM_TICKET_QUERY = """
    SELECT DISTINCT Tickets.Id
      FROM Queues, Tickets, Transactions
           LEFT OUTER JOIN Attachments ON Attachments.TransactionId = Transactions.Id
     WHERE Queues.Name = 'spam'
       AND Tickets.Queue = Queues.Id
       AND Tickets.Status = 'rejected'
       AND Transactions.ObjectId = Tickets.Id
       AND Transactions.ObjectType = 'RT::Ticket'
       AND Attachments.MessageId = %s;
"""

EMAIL_PARSER = email.parser.Parser()

if DEBUG:
    def log(msg):
        """Print a diagnostic message (DEBUG is enabled)."""
        print(msg)
else:
    def log(msg):
        """Discard a diagnostic message (DEBUG is disabled)."""
        pass


def _has_matching_ticket(query, msg_id):
    """Return True if `query`, parameterized with `msg_id`, yields a row.

    Uses the module-level connection `con`, which must have been opened
    before this is called.
    """
    cur = con.cursor()
    try:
        cur.execute(query, (msg_id,))
        return cur.fetchone() is not None
    finally:
        cur.close()


def is_ham(msg_id):
    """Return True if `msg_id` is matched by SELECT_HAM_TICKET_QUERY."""
    return _has_matching_ticket(SELECT_HAM_TICKET_QUERY, msg_id)


def is_spam(msg_id):
    """Return True if `msg_id` is matched by SELECT_SPAM_TICKET_QUERY."""
    return _has_matching_ticket(SELECT_SPAM_TICKET_QUERY, msg_id)


def _remove(path):
    """Report that the message at `path` should be removed.

    NOTE(review): this only prints the intended call; nothing is deleted.
    It looks like a dry-run placeholder -- confirm before replacing it
    with a real os.unlink(path).
    """
    print("os.unlink(" + path)


def _move_to(maildir, path):
    """Move the message at `path` into the `cur` directory of `maildir`."""
    target = os.path.join(MAILDIR_ROOT, maildir, 'cur', os.path.basename(path))
    os.rename(path, target)


def handle_message(path):
    """Classify the message stored at `path` and act on the result.

    The message is moved to the ham or spam training maildir when RT knows
    its Message-Id, reported for removal when its Message-Id is missing or
    malformed or when the file is too old, and left untouched otherwise.
    """
    # Only the headers are needed; close the file as soon as they are read.
    with open(path) as msg_file:
        msg = EMAIL_PARSER.parse(msg_file, headersonly=True)

    # A missing header is returned as None; treat it like a malformed one
    # instead of failing on None.startswith().
    msg_id = msg['Message-Id']
    if (msg_id is None
            or not msg_id.startswith('<')
            or not msg_id.endswith('>')):
        log("%s: bad Message-Id, removing." % path)
        _remove(path)
        return

    # The queries are given the identifier without its angle brackets.
    msg_id = msg_id[1:-1]
    if is_ham(msg_id):
        _move_to(HAM_MAILDIR, path)
        log("%s: ham, moving." % path)
        return
    if is_spam(msg_id):
        _move_to(SPAM_MAILDIR, path)
        log("%s: spam, moving." % path)
        return

    mtime = datetime.fromtimestamp(os.stat(path).st_mtime)
    limit = datetime.now() - timedelta(days=KEEP_FOR_MAX_DAYS)
    if mtime <= limit:
        log("%s: too old, removing." % path)
        _remove(path)
        return

    # well, it's not identified ham, not identified spam, and not too old
    # let's keep the message for now
    log("%s: unknown, keeping." % path)


def scan_directory(dir_path):
    """Run handle_message() on every regular file directly in `dir_path`.

    A missing directory is skipped, as are entries that are not regular
    files, so an incomplete maildir does not abort the whole run.
    """
    if not os.path.isdir(dir_path):
        log("%s: not a directory, skipping." % dir_path)
        return
    for filename in os.listdir(dir_path):
        path = os.path.join(dir_path, filename)
        if not os.path.isfile(path):
            continue
        handle_message(path)


# Connection to the RT database, opened by main().
con = None


def main():
    """Scan every `.help*` maildir under MAILDIR_ROOT."""
    global con

    con = psycopg2.connect(RT_CONNINFO)
    try:
        for filename in os.listdir(MAILDIR_ROOT):
            if filename.startswith('.help'):
                for subdir in ['new', 'cur', 'tmp']:
                    scan_directory(os.path.join(MAILDIR_ROOT, filename, subdir))
    finally:
        # Release the connection even when a scan fails part-way through.
        con.close()


if __name__ == '__main__':
    main()
tor-commits@lists.torproject.org