[tor-commits] [support-tools/master] Add script to sort out spam from ham depending on ticket status

lunar at torproject.org lunar at torproject.org
Fri Oct 31 13:32:52 UTC 2014


commit bbdf8b1868a21006fc8eaae72b35d16c5183f1d8
Author: Lunar <lunar at torproject.org>
Date:   Fri Oct 31 14:32:40 2014 +0100

    Add script to sort out spam from ham depending on ticket status
---
 train-spam-filters/train_spam_filters |  119 +++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/train-spam-filters/train_spam_filters b/train-spam-filters/train_spam_filters
new file mode 100755
index 0000000..c8f64dc
--- /dev/null
+++ b/train-spam-filters/train_spam_filters
@@ -0,0 +1,119 @@
+#!/usr/bin/python
+#
+# This program is free software. It comes without any warranty, to
+# the extent permitted by applicable law. You can redistribute it
+# and/or modify it under the terms of the Do What The Fuck You Want
+# To Public License, Version 2, as published by Sam Hocevar. See
+# http://sam.zoy.org/wtfpl/COPYING for more details.
+
+from __future__ import print_function
+
+import email.parser
+import psycopg2
+import os
+import os.path
+from datetime import datetime, timedelta
+
+# When true, log() prints its messages; otherwise log() is a no-op.
+DEBUG = False
+
+# Root of the Maildir tree, located under the invoking user's $HOME.
+MAILDIR_ROOT = os.path.join(os.environ['HOME'], 'Maildir')
+# Folders below MAILDIR_ROOT whose 'cur' subdirectory receives the messages
+# identified as spam and as ham respectively.
+SPAM_MAILDIR = '.spam.learn'
+HAM_MAILDIR = '.xham.learn'
+
+# Messages that are neither known ham nor known spam are reported for
+# removal once their mtime is at least this many days in the past.
+KEEP_FOR_MAX_DAYS = 100
+
+# Connection string handed to psycopg2.connect() to reach the 'rt' database.
+RT_CONNINFO = "host=drobovi.torproject.org sslmode=require user=rtreader dbname=rt"
+
+# Finds tickets with status 'resolved' in a queue whose name starts with
+# 'help' and that have an attachment carrying the given Message-Id.
+# The single %s placeholder is filled with the Message-Id (without angle
+# brackets) by cursor.execute(); '%%' is a literal '%' for the LIKE pattern,
+# doubled because the string goes through parameter substitution.
+SELECT_HAM_TICKET_QUERY = """
+    SELECT DISTINCT Tickets.Id
+      FROM Queues, Tickets, Transactions
+           LEFT OUTER JOIN Attachments ON Attachments.TransactionId = Transactions.Id
+     WHERE Queues.Name LIKE 'help%%'
+       AND Tickets.Queue = Queues.Id
+       AND Tickets.Status = 'resolved'
+       AND Transactions.ObjectId = Tickets.Id
+       AND Transactions.ObjectType = 'RT::Ticket'
+       AND Attachments.MessageId = %s;
+"""
+
+# Same lookup as above, but for tickets with status 'rejected' in the queue
+# named exactly 'spam'.
+SELECT_SPAM_TICKET_QUERY = """
+    SELECT DISTINCT Tickets.Id
+      FROM Queues, Tickets, Transactions
+           LEFT OUTER JOIN Attachments ON Attachments.TransactionId = Transactions.Id
+     WHERE Queues.Name = 'spam'
+       AND Tickets.Queue = Queues.Id
+       AND Tickets.Status = 'rejected'
+       AND Transactions.ObjectId = Tickets.Id
+       AND Transactions.ObjectType = 'RT::Ticket'
+       AND Attachments.MessageId = %s;
+"""
+
+# Shared parser instance; handle_message() uses it to read message headers.
+EMAIL_PARSER = email.parser.Parser()
+
+if DEBUG:
+    def log(msg):
+        print(msg)
+else:
+    def log(msg):
+        pass
+
+def is_ham(msg_id):
+    global con
+
+    cur = con.cursor()
+    try:
+        cur.execute(SELECT_HAM_TICKET_QUERY, (msg_id,))
+        return cur.fetchone() is not None
+    finally:
+        cur.close()
+
+def is_spam(msg_id):
+    global con
+
+    cur = con.cursor()
+    try:
+        cur.execute(SELECT_SPAM_TICKET_QUERY, (msg_id,))
+        return cur.fetchone() is not None
+    finally:
+        cur.close()
+
+def handle_message(path):
+    msg = EMAIL_PARSER.parse(open(path), headersonly=True)
+    msg_id = msg['Message-Id']
+    if not msg_id.startswith('<') or not msg_id.endswith('>'):
+        log("%s: bad Message-Id, removing." % path)
+        print("os.unlink(" + path)
+        return
+    msg_id = msg_id[1:-1]
+    if is_ham(msg_id):
+        os.rename(path, os.path.join(MAILDIR_ROOT, HAM_MAILDIR, 'cur', os.path.basename(path)))
+        log("%s: ham, moving." % path)
+        return
+    if is_spam(msg_id):
+        os.rename(path, os.path.join(MAILDIR_ROOT, SPAM_MAILDIR, 'cur', os.path.basename(path)))
+        log("%s: spam, moving." % path)
+        return
+    mtime = datetime.fromtimestamp(os.stat(path).st_mtime)
+    limit = datetime.now() - timedelta(days=KEEP_FOR_MAX_DAYS)
+    if mtime <= limit:
+        log("%s: too old, removing." % path)
+        print("os.unlink(" + path)
+        return
+    # well, it's not identified ham, not identified spam, and not too old
+    # let's keep the message for now
+    log("%s: unknown, keeping." % path)
+
+def scan_directory(dir_path):
+    for filename in os.listdir(dir_path):
+        path = os.path.join(dir_path, filename)
+        handle_message(path)
+        
+con = None
+
+if __name__ == '__main__':
+    con = psycopg2.connect(RT_CONNINFO)
+    for filename in os.listdir(MAILDIR_ROOT):
+        if filename.startswith('.help'):
+            for subdir in ['new', 'cur', 'tmp']:
+                scan_directory(os.path.join(MAILDIR_ROOT, filename, subdir))
+    con.close()



More information about the tor-commits mailing list