imaginary-realities/update.py

"""
Author: Richard Tew <richard.m.tew@gmail.com>
This script can either be invoked manually, if required, or periodically by
having a scheduler like cron invoke it.
It is not necessary to factor in the usage limits of the services it polls
(Reddit, Disqus, ...) as the script will do that itself and only access them
if a minimum amount of time has passed.
"""
# What we can use from the standard library.
import codecs
import collections
import ConfigParser
import datetime
import email.utils
import json
import os
import sys
import time
import types
# The external burden of required dependencies.
import praw
from disqusapi import DisqusAPI
# The burden of our own module.
import gensite
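
# Reddit asks API clients to identify themselves with a descriptive user agent
# of the form "<platform>:<app id>:<version> (by /u/<username>)"; the values
# below are combined to follow that convention.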
r_username = "rmtew"
r_platform = "python"
r_appname = "siteshitter"
r_appversion = "v0.1"
r_useragent = "%s:%s:%s (by /u/%s)" % (r_platform, r_appname, r_appversion, r_username)
SITE_URL = "http://journal.imaginary-realities.com/"

def make_time_string(value):
    """ Convert a duration (a number of seconds, or a "HH:MM:SS" string) into
    a rough human-readable form such as "3 weeks" or "1 hour".  It's quicker
    to write this than to find something that does it and know it works. """
    if type(value) in (int, float):
        value = int(value)
        hours = value / (60 * 60)
        value = value % (60 * 60)
        minutes = value / 60
        value = value % 60
        seconds = value
    else:
        hours, minutes, seconds = [ int(v) for v in value.split(":") ]
    if hours:
        if hours >= 24:
            days = hours / 24
            if days >= 7:
                if days >= 30:
                    months = int(days / (365/12.0))
                    if months >= 12:
                        years = days / 365
                        time_string = "%d year%s" % (years, "" if years == 1 else "s")
                    else:
                        time_string = "%d month%s" % (months, "" if months == 1 else "s")
                else:
                    weeks = days / 7
                    time_string = "%d week%s" % (weeks, "" if weeks == 1 else "s")
            else:
                time_string = "%d day%s" % (days, "" if days == 1 else "s")
        else:
            time_string = "%d hour%s" % (hours, "" if hours == 1 else "s")
    elif minutes:
        time_string = "%d minute%s" % (minutes, "" if minutes == 1 else "s")
    else:
        time_string = "%d second%s" % (seconds, "" if seconds == 1 else "s")
    return time_string
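
# For example (these match the self-tests in run_tests below):
#   make_time_string("01:00:00") -> "1 hour"
#   make_time_string("24:00:00") -> "1 day"
#   make_time_string(59)         -> "59 seconds"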

def run_reddit(update=False):
    # NOTE(rmtew): Until I work out which values to display, this will store
    # all of the submission data that can be reconciled as storable.
    if update:
        r = praw.Reddit(user_agent=r_useragent)
        submissions = r.search("url:'%s'" % SITE_URL, sort="new")
        header = None
        lines = []
        string_columns = set()
        for match in submissions:
            if header is None:
                # Work out the column names from the first submission's attributes.
                header = []
                for k in dir(match):
                    if k[0] == "_" or k in ("reddit_session", "comments"): continue
                    v = getattr(match, k)
                    if type(v) in (types.MethodType, types.FunctionType): continue
                    if type(v) in types.StringTypes:
                        header.append(k)
                    elif type(v) in (bool, int, float, types.NoneType, dict, list):
                        header.append(k)
                    elif type(v) in (types.ClassType, types.TypeType):
                        string_columns.add(k)
                        header.append(k)
            line = []
            for column_name in header:
                v = getattr(match, column_name, None)
                if column_name in string_columns:
                    line.append(unicode(v))
                else:
                    line.append(v)
            lines.append(line)
        # reddit.json ends up holding a (header, rows) pair: the column names
        # and one row of values per matching submission.
        with codecs.open("reddit.json", "wb", "utf-8") as f:
            json.dump((header, lines), f)

def run_disqus(update=False):
    # Get the absolute path of the directory the script is located in.
    script_path = os.path.dirname(__file__)
    if not len(script_path):
        script_path = sys.path[0]

    config = ConfigParser.ConfigParser()
    config.read([
        os.path.join(script_path, "ir-config.ini"),
    ])

    # If this is not an absolute path, make it one based on the script directory.
    data_path = config.get("paths", "data-storage")
    if not os.path.isabs(data_path):
        data_path = os.path.abspath(os.path.join(script_path, data_path))

    # Load in persisted data, if any exists yet.
    persists = {}
    datafile_path = os.path.join(data_path, "data.json")
    if os.path.exists(datafile_path):
        persists = json.load(open(datafile_path, "rb"))

    if update:
        SECRET_KEY = config.get("disqus-keys", "private")
        PUBLIC_KEY = config.get("disqus-keys", "public")
        disqus = DisqusAPI(SECRET_KEY, PUBLIC_KEY)

        thread_data = persists.get("disqus-thread-data", {})
        _comment_threads = set()
        # ....
        # comment_timestamp = time.time()
        comment_data = []
        for comment in disqus.posts.list(forum="imaginaryrealities"):
            if comment["isSpam"] or comment["isDeleted"]:
                continue

            # Get the RFC3339 date string from disqus, assume it's UTC, and convert it to a timestamp.
            timeseq = list(time.strptime(comment["createdAt"], '%Y-%m-%dT%H:%M:%S'))
            timeseq.append(0) # offset of date's timezone from UTC.
            timeseq = tuple(timeseq)
            post_timestamp = email.utils.mktime_tz(timeseq) # UTC seconds since the epoch

            poster_username = comment["author"]["name"]
            post_commentid = comment["id"]
            post_threadid = comment["thread"]
            post_text = comment["raw_message"]
            comment_data.append((post_commentid, poster_username, post_timestamp, post_threadid, post_text))

            # Track the threads which the processed comments belong to.
            _comment_threads.add(post_threadid)

        # Check which threads have new comments which we do not know about.
        _unknown_threads = _comment_threads - set(thread_data)
        if len(_unknown_threads):
            print "Processing new threads."
            thread_timestamp = persists.get("disqus-thread-timestamp", '1333256400')
            # TODO: disqus does not like a 'since' value, so one cannot be passed until the reason is determined.
            for result in disqus.threads.list(forum="imaginaryrealities", limit=20):#, since=thread_timestamp):
                thread_id = result["id"]
                thread_entry = [ result["link"], result["feed"], result["clean_title"] ]
                thread_data[thread_id] = thread_entry
            persists["disqus-thread-timestamp"] = time.time()

            _unknown_threads = _comment_threads - set(thread_data)
            if len(_unknown_threads):
                print "ERROR: still have %d unknown threads" % len(_unknown_threads)

        persists["disqus-comment-data"] = comment_data
        persists["disqus-thread-data"] = thread_data
        json.dump(persists, open(datafile_path, "wb"))

    return persists
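
# For reference, the persisted data.json written above roughly takes this
# shape (a sketch inferred from the keys used, not a formal schema):
#   {
#     "disqus-thread-timestamp": <float, seconds since the epoch>,
#     "disqus-thread-data": { thread_id: [link, feed, clean_title], ... },
#     "disqus-comment-data": [ [comment_id, user_name, timestamp, thread_id, text], ... ]
#   }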

def run_tests():
    def test_make_time_string(s_in, s_out):
        s_out_actual = make_time_string(s_in)
        if s_out_actual != s_out:
            raise Exception("Expected '%s'; got '%s'" % (s_out, s_out_actual))

    test_make_time_string("00:00:00", "0 seconds")
    test_make_time_string("00:00:01", "1 second")
    test_make_time_string("00:00:59", "59 seconds")
    test_make_time_string("00:01:00", "1 minute")
    test_make_time_string("00:59:00", "59 minutes")
    test_make_time_string("01:00:00", "1 hour")
    test_make_time_string("23:00:00", "23 hours")
    test_make_time_string("24:00:00", "1 day")
    test_make_time_string("%02d:00:00" % (24*6+23), "6 days")
    test_make_time_string("%02d:00:00" % (24*7), "1 week")
    test_make_time_string("%02d:00:00" % (24*7*3), "3 weeks")
    test_make_time_string("%02d:00:00" % (24*7*5), "1 month")
    test_make_time_string("%02d:00:00" % (24*7*5*2), "2 months")
    test_make_time_string("%02d:00:00" % (24*365), "1 year")

class disqus_data_wrapper(object):
    comment_class = collections.namedtuple("comment_class", [ "comment_id", "user_name", "timestamp", "thread_id", "text" ])
    thread_class = collections.namedtuple("thread_class", [ "url", "feed", "title" ])

    def __init__(self, data):
        self.data = data

    def get_recent_comments(self, limit=6):
        # For now, disqus provides the comments from newest to oldest.
        print self.data.keys()
        return [
            self.comment_class(*entry)
            for entry
            in self.data.get("disqus-comment-data", [])
        ][:limit]

    def get_thread(self, thread_id):
        return self.thread_class(*self.data.get("disqus-thread-data", {})[thread_id])

    def get_time_string(self, timestamp):
        return make_time_string(time.time() - timestamp)


class reddit_data_wrapper(object):
    def __init__(self, data):
        self.data = data

def run():
    args = set(sys.argv[1:])
    if "tests" in args:
        print "Running tests.."
        run_tests()
        print "..done"
        sys.exit(1)

    update_disqus = "disqus" in args or "all" in args
    update_reddit = "reddit" in args # or "all" in args
    update_website = "website" in args or "all" in args

    disqus_data = run_disqus(update=update_disqus)
    reddit_data = run_reddit(update=update_reddit)

    if update_website:
        gensite_targets = (gensite.TARGET_WEBSITE, )# | gensite.FLAG_ONLINE, )
        gensite.run(gensite_targets,
            disqus_data=disqus_data_wrapper(disqus_data),
            reddit_data=reddit_data_wrapper(reddit_data))
        # TODO(rmtew): Put the resulting website in place. Set permissions.


if __name__ == "__main__":
    run()