""" Author: Richard Tew This script can either be invoked manually, if required, or periodically by having a scheduler like cron invoke it. It is not necessary to factor in the usage limits of the services it polls (Reddit, Disqus, ...) as the script will do that itself and only access them if a minimum amount of time has passed. """ # What we can use from the standard library. import codecs import collections import ConfigParser import datetime import email.utils import json import os import sys import time import types # The external burden of required dependencies. import praw from disqusapi import DisqusAPI # The burden of our own module. import gensite r_username = "rmtew" r_platform = "python" r_appname = "siteshitter" r_appversion = "v0.1" r_useragent = "%s:%s:%s (by /u/%s)" % (r_platform, r_appname, r_appversion, r_username) SITE_URL = "http://journal.imaginary-realities.com/" def make_time_string(value): """ It's quicker to write this than to find something that does it and know it works. """ if type(value) in (int, float): value = int(value) hours = value / (60 * 60) value = value % (60 * 60) minutes = value / 60 value = value % 60 seconds = value else: hours, minutes, seconds = [ int(v) for v in value.split(":") ] if hours: if hours >= 24: days = hours / 24 if days >= 7: if days >= 30: months = int(days / (365/12.0)) if months >= 12: years = days / 365 time_string = "%d year%s" % (years, "" if years == 1 else "s") else: time_string = "%d month%s" % (months, "" if months == 1 else "s") else: weeks = days / 7 time_string = "%d week%s" % (weeks, "" if weeks == 1 else "s") else: time_string = "%d day%s" % (days, "" if days == 1 else "s") else: time_string = "%d hour%s" % (hours, "" if hours == 1 else "s") elif minutes: time_string = "%d minute%s" % (minutes, "" if minutes == 1 else "s") else: time_string = "%d second%s" % (seconds, "" if seconds == 1 else "s") return time_string def run_reddit(update=False): # NOTE(rmtew): Until I work out what values to display, this will store all the # data that can be reconciled as storable. if update: r = praw.Reddit(user_agent=r_useragent) submissions = r.search("url:'%s'" % SITE_URL, sort="new") header = None lines = [] string_columns = set() for match in submissions: if header is None: header = [] for k in dir(match): if k[0] == "_" or k in ("reddit_session", "comments"): continue v = getattr(match,k) if type(v) in (types.MethodType, types.FunctionType): continue if type(v) in types.StringTypes: header.append(k) elif type(v) in (bool, int, float, types.NoneType, dict, list): header.append(k) elif type(v) in (types.ClassType, types.TypeType): string_columns.add(k) header.append(k) line = [] for column_name in header: v = getattr(match, column_name, None) if column_name in string_columns: line.append(unicode(v)) else: line.append(v) lines.append(line) with codecs.open("reddit.json", "wb", "utf-8") as f: json.dump((header, lines), f) def run_disqus(update=False): # Get the absolute path of the directory the script is located in. script_path = os.path.dirname(__file__) if not len(script_path): script_path = sys.path[0] config = ConfigParser.ConfigParser() config.read([ os.path.join(script_path, "ir-config.ini"), ]) # If this is not an absolute path, make it one based on the script directory. data_path = config.get("paths", "data-storage") if not os.path.isabs(data_path): data_path = os.path.abspath(os.path.join(script_path, data_path)) # TODO: Load in persisted data. 
    persists = {}
    datafile_path = os.path.join(data_path, "data.json")
    if os.path.exists(datafile_path):
        persists = json.load(open(datafile_path, "rb"))

    if update:
        SECRET_KEY = config.get("disqus-keys", "private")
        PUBLIC_KEY = config.get("disqus-keys", "public")

        disqus = DisqusAPI(SECRET_KEY, PUBLIC_KEY)

        thread_data = persists.get("disqus-thread-data", {})
        _comment_threads = set()

        # ....
        # comment_timestamp = time.time()
        comment_data = []
        for comment in disqus.posts.list(forum="imaginaryrealities"):
            if comment["isSpam"] or comment["isDeleted"]:
                continue

            # Get the RFC3339 date string from disqus, assume it's UTC, and
            # convert it to a timestamp.
            timeseq = list(time.strptime(comment["createdAt"], '%Y-%m-%dT%H:%M:%S'))
            timeseq.append(0) # Offset of the date's timezone from UTC.
            timeseq = tuple(timeseq)
            post_timestamp = email.utils.mktime_tz(timeseq) # UTC seconds since the epoch.

            poster_username = comment["author"]["name"]
            post_commentid = comment["id"]
            post_threadid = comment["thread"]
            post_text = comment["raw_message"]
            comment_data.append((post_commentid, poster_username, post_timestamp, post_threadid, post_text))

            # Track the threads which the processed comments belong to.
            _comment_threads.add(post_threadid)

        # Check which threads have new comments which we do not know about.
        _unknown_threads = _comment_threads - set(thread_data)
        if len(_unknown_threads):
            print "Processing new threads."
            thread_timestamp = persists.get("disqus-thread-timestamp", '1333256400')
            # TODO: disqus does not like a 'since' value, so one cannot be
            # passed until the reason why is determined.
            for result in disqus.threads.list(forum="imaginaryrealities", limit=20): #, since=thread_timestamp):
                thread_id = result["id"]
                thread_entry = [ result["link"], result["feed"], result["clean_title"] ]
                thread_data[thread_id] = thread_entry
            persists["disqus-thread-timestamp"] = time.time()

            _unknown_threads = _comment_threads - set(thread_data)
            if len(_unknown_threads):
                print "ERROR: still have %d unknown threads" % len(_unknown_threads)

        persists["disqus-comment-data"] = comment_data
        persists["disqus-thread-data"] = thread_data
        json.dump(persists, open(datafile_path, "wb"))

    return persists


def run_tests():
    def test_make_time_string(s_in, s_out):
        s_out_actual = make_time_string(s_in)
        if s_out_actual != s_out:
            raise Exception("Expected '%s'; got '%s'" % (s_out, s_out_actual))

    test_make_time_string("00:00:00", "0 seconds")
    test_make_time_string("00:00:01", "1 second")
    test_make_time_string("00:00:59", "59 seconds")
    test_make_time_string("00:01:00", "1 minute")
    test_make_time_string("00:59:00", "59 minutes")
    test_make_time_string("01:00:00", "1 hour")
    test_make_time_string("23:00:00", "23 hours")
    test_make_time_string("24:00:00", "1 day")
    test_make_time_string("%02d:00:00" % (24*6+23), "6 days")
    test_make_time_string("%02d:00:00" % (24*7), "1 week")
    test_make_time_string("%02d:00:00" % (24*7*3), "3 weeks")
    test_make_time_string("%02d:00:00" % (24*7*5), "1 month")
    test_make_time_string("%02d:00:00" % (24*7*5*2), "2 months")
    test_make_time_string("%02d:00:00" % (24*365), "1 year")


class disqus_data_wrapper(object):
    comment_class = collections.namedtuple("comment_class", [ "comment_id", "user_name", "timestamp", "thread_id", "text" ])
    thread_class = collections.namedtuple("thread_class", [ "url", "feed", "title" ])

    def __init__(self, data):
        self.data = data

    def get_recent_comments(self, limit=6):
        # For now, disqus provides the comments from newest to oldest.
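        # NOTE: each stored entry is a (comment_id, user_name, timestamp,
        # thread_id, text) tuple, matching the comment_class field order
        # used by run_disqus() above.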
        print self.data.keys()
        return [ self.comment_class(*entry) for entry in self.data.get("disqus-comment-data", []) ][:limit]

    def get_thread(self, thread_id):
        return self.thread_class(*self.data.get("disqus-thread-data", {})[thread_id])

    def get_time_string(self, timestamp):
        return make_time_string(time.time() - timestamp)


class reddit_data_wrapper(object):
    def __init__(self, data):
        self.data = data


def run():
    args = set(sys.argv[1:])
    if "tests" in args:
        print "Running tests.."
        run_tests()
        print "..done"
        sys.exit(1)

    update_disqus = "disqus" in args or "all" in args
    update_reddit = "reddit" in args # or "all" in args
    update_website = "website" in args or "all" in args

    disqus_data = run_disqus(update=update_disqus)
    reddit_data = run_reddit(update=update_reddit)

    if update_website:
        gensite_targets = (gensite.TARGET_WEBSITE, ) # | gensite.FLAG_ONLINE, )
        gensite.run(gensite_targets, disqus_data=disqus_data_wrapper(disqus_data), reddit_data=reddit_data_wrapper(reddit_data))

        # TODO(rmtew): Put the resulting website in place.  Set permissions.


if __name__ == "__main__":
    run()
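
# A sketch of how the script is intended to be invoked; the filename
# "sitepoll.py" is a placeholder for whatever this file is saved as:
#
#   python sitepoll.py tests    # run the make_time_string self-tests
#   python sitepoll.py all      # refresh Disqus data and regenerate the site
#
# Or scheduled from cron, e.g. every 30 minutes (the interval is an
# assumption; per the module docstring, the script is meant to rate-limit
# its own access to the polled services):
#
#   */30 * * * * python /path/to/sitepoll.py all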