imaginary-realities/update.py

"""
Author: Richard Tew <richard.m.tew@gmail.com>
This script can either be invoked manually, if required, or periodically by
having a scheduler like cron invoke it.
It is not necessary to factor in the usage limits of the services it polls
(Reddit, Disqus, ...) as the script will do that itself and only access them
if a minimum amount of time has passed.
"""
# What we can use from the standard library.
import codecs
import collections
import ConfigParser
import datetime
import email.utils
import json
import os
import sys
import time
import types
# The external burden of required dependencies.
import praw
from disqusapi import DisqusAPI
# The burden of our own module.
import gensite
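
# Reddit asks API clients to identify themselves with a descriptive user agent
# of the form "<platform>:<app id>:<version> (by /u/<username>)"; the values
# below are combined to follow that convention.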
r_username = "rmtew"
r_platform = "python"
r_appname = "siteshitter"
r_appversion = "v0.1"
r_useragent = "%s:%s:%s (by /u/%s)" % (r_platform, r_appname, r_appversion, r_username)
SITE_URL = "http://journal.imaginary-realities.com/"

def make_time_string(value):
    """ Convert a duration (a number of seconds, or a "HH:MM:SS" string) into
    a rough human-readable form such as "3 weeks" or "1 hour".  It's quicker
    to write this than to find something that does it and know it works. """
    if type(value) in (int, float):
        value = int(value)
        hours = value / (60 * 60)
        value = value % (60 * 60)
        minutes = value / 60
        value = value % 60
        seconds = value
    else:
        hours, minutes, seconds = [ int(v) for v in value.split(":") ]
    if hours:
        if hours >= 24:
            days = hours / 24
            if days >= 7:
                if days >= 30:
                    months = int(days / (365/12.0))
                    if months >= 12:
                        years = days / 365
                        time_string = "%d year%s" % (years, "" if years == 1 else "s")
                    else:
                        time_string = "%d month%s" % (months, "" if months == 1 else "s")
                else:
                    weeks = days / 7
                    time_string = "%d week%s" % (weeks, "" if weeks == 1 else "s")
            else:
                time_string = "%d day%s" % (days, "" if days == 1 else "s")
        else:
            time_string = "%d hour%s" % (hours, "" if hours == 1 else "s")
    elif minutes:
        time_string = "%d minute%s" % (minutes, "" if minutes == 1 else "s")
    else:
        time_string = "%d second%s" % (seconds, "" if seconds == 1 else "s")
    return time_string
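
# For example (these match the self-tests in run_tests below):
#   make_time_string("01:00:00") -> "1 hour"
#   make_time_string("24:00:00") -> "1 day"
#   make_time_string(59)         -> "59 seconds"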

def run_reddit(update=False):
    # NOTE(rmtew): Until I work out which values to display, this will store
    # all of the submission data that can be reconciled as storable.
    if update:
        r = praw.Reddit(user_agent=r_useragent)
        submissions = r.search("url:'%s'" % SITE_URL, sort="new")
        header = None
        lines = []
        string_columns = set()
        for match in submissions:
            if header is None:
                # Work out the column names from the first submission's attributes.
                header = []
                for k in dir(match):
                    if k[0] == "_" or k in ("reddit_session", "comments"): continue
                    v = getattr(match, k)
                    if type(v) in (types.MethodType, types.FunctionType): continue
                    if type(v) in types.StringTypes:
                        header.append(k)
                    elif type(v) in (bool, int, float, types.NoneType, dict, list):
                        header.append(k)
                    elif type(v) in (types.ClassType, types.TypeType):
                        string_columns.add(k)
                        header.append(k)
            line = []
            for column_name in header:
                v = getattr(match, column_name, None)
                if column_name in string_columns:
                    line.append(unicode(v))
                else:
                    line.append(v)
            lines.append(line)
        # reddit.json ends up holding a (header, rows) pair: the column names
        # and one row of values per matching submission.
        with codecs.open("reddit.json", "wb", "utf-8") as f:
            json.dump((header, lines), f)

def run_disqus(update=False):
    # Get the absolute path of the directory the script is located in.
    script_path = os.path.dirname(__file__)
    if not len(script_path):
        script_path = sys.path[0]

    config = ConfigParser.ConfigParser()
    config.read([
        os.path.join(script_path, "ir-config.ini"),
    ])

    # If this is not an absolute path, make it one based on the script directory.
    data_path = config.get("paths", "data-storage")
    if not os.path.isabs(data_path):
        data_path = os.path.abspath(os.path.join(script_path, data_path))

    # Load in persisted data, if any exists yet.
    persists = {}
    datafile_path = os.path.join(data_path, "data.json")
    if os.path.exists(datafile_path):
        persists = json.load(open(datafile_path, "rb"))

    if update:
        SECRET_KEY = config.get("disqus-keys", "private")
        PUBLIC_KEY = config.get("disqus-keys", "public")
        disqus = DisqusAPI(SECRET_KEY, PUBLIC_KEY)

        thread_data = persists.get("disqus-thread-data", {})
        _comment_threads = set()
        # ....
        # comment_timestamp = time.time()
        comment_data = []
        for comment in disqus.posts.list(forum="imaginaryrealities"):
            if comment["isSpam"] or comment["isDeleted"]:
                continue

            # Get the RFC3339 date string from disqus, assume it's UTC, and convert it to a timestamp.
            timeseq = list(time.strptime(comment["createdAt"], '%Y-%m-%dT%H:%M:%S'))
            timeseq.append(0) # offset of date's timezone from UTC.
            timeseq = tuple(timeseq)
            post_timestamp = email.utils.mktime_tz(timeseq) # UTC seconds since the epoch

            poster_username = comment["author"]["name"]
            post_commentid = comment["id"]
            post_threadid = comment["thread"]
            post_text = comment["raw_message"]
            comment_data.append((post_commentid, poster_username, post_timestamp, post_threadid, post_text))

            # Track the threads which the processed comments belong to.
            _comment_threads.add(post_threadid)

        # Check which threads have new comments which we do not know about.
        _unknown_threads = _comment_threads - set(thread_data)
        if len(_unknown_threads):
            print "Processing new threads."
            thread_timestamp = persists.get("disqus-thread-timestamp", '1333256400')
            # TODO: disqus does not like a 'since' value, so one cannot be passed until the reason is determined.
            for result in disqus.threads.list(forum="imaginaryrealities", limit=20):#, since=thread_timestamp):
                thread_id = result["id"]
                thread_entry = [ result["link"], result["feed"], result["clean_title"] ]
                thread_data[thread_id] = thread_entry
            persists["disqus-thread-timestamp"] = time.time()

            _unknown_threads = _comment_threads - set(thread_data)
            if len(_unknown_threads):
                print "ERROR: still have %d unknown threads" % len(_unknown_threads)

        persists["disqus-comment-data"] = comment_data
        persists["disqus-thread-data"] = thread_data
        json.dump(persists, open(datafile_path, "wb"))

    return persists
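
# For reference, the persisted data.json written above roughly takes this
# shape (a sketch inferred from the keys used, not a formal schema):
#   {
#     "disqus-thread-timestamp": <float, seconds since the epoch>,
#     "disqus-thread-data": { thread_id: [link, feed, clean_title], ... },
#     "disqus-comment-data": [ [comment_id, user_name, timestamp, thread_id, text], ... ]
#   }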

def run_tests():
    def test_make_time_string(s_in, s_out):
        s_out_actual = make_time_string(s_in)
        if s_out_actual != s_out:
            raise Exception("Expected '%s'; got '%s'" % (s_out, s_out_actual))

    test_make_time_string("00:00:00", "0 seconds")
    test_make_time_string("00:00:01", "1 second")
    test_make_time_string("00:00:59", "59 seconds")
    test_make_time_string("00:01:00", "1 minute")
    test_make_time_string("00:59:00", "59 minutes")
    test_make_time_string("01:00:00", "1 hour")
    test_make_time_string("23:00:00", "23 hours")
    test_make_time_string("24:00:00", "1 day")
    test_make_time_string("%02d:00:00" % (24*6+23), "6 days")
    test_make_time_string("%02d:00:00" % (24*7), "1 week")
    test_make_time_string("%02d:00:00" % (24*7*3), "3 weeks")
    test_make_time_string("%02d:00:00" % (24*7*5), "1 month")
    test_make_time_string("%02d:00:00" % (24*7*5*2), "2 months")
    test_make_time_string("%02d:00:00" % (24*365), "1 year")

class disqus_data_wrapper(object):
    comment_class = collections.namedtuple("comment_class", [ "comment_id", "user_name", "timestamp", "thread_id", "text" ])
    thread_class = collections.namedtuple("thread_class", [ "url", "feed", "title" ])

    def __init__(self, data):
        self.data = data

    def get_recent_comments(self, limit=6):
        # For now, disqus provides the comments from newest to oldest.
        print self.data.keys()
        return [
            self.comment_class(*entry)
            for entry
            in self.data.get("disqus-comment-data", [])
        ][:limit]

    def get_thread(self, thread_id):
        return self.thread_class(*self.data.get("disqus-thread-data", {})[thread_id])

    def get_time_string(self, timestamp):
        return make_time_string(time.time() - timestamp)


class reddit_data_wrapper(object):
    def __init__(self, data):
        self.data = data

def run():
    args = set(sys.argv[1:])
    if "tests" in args:
        print "Running tests.."
        run_tests()
        print "..done"
        sys.exit(1)

    update_disqus = "disqus" in args or "all" in args
    update_reddit = "reddit" in args # or "all" in args
    update_website = "website" in args or "all" in args

    disqus_data = run_disqus(update=update_disqus)
    reddit_data = run_reddit(update=update_reddit)

    if update_website:
        gensite_targets = (gensite.TARGET_WEBSITE, )# | gensite.FLAG_ONLINE, )
        gensite.run(gensite_targets,
            disqus_data=disqus_data_wrapper(disqus_data),
            reddit_data=reddit_data_wrapper(reddit_data))
        # TODO(rmtew): Put the resulting website in place. Set permissions.


if __name__ == "__main__":
    run()