119/scrapeArchive.py

import re
import urllib2
import json

def parseList(filePath):
  """Extract archive.org item links from a saved list file.

  Reads filePath line by line. Every line that starts with "<a" and
  contains a `details/<id>">` fragment yields one entry.

  Args:
    filePath: path of the text/HTML file to scan.

  Returns:
    A list of dicts, one per matching line, with keys:
      'id'  -- the list of all ids matched on that line (the raw
               findall() result, kept as a list for backward
               compatibility with existing callers)
      'url' -- "http://archive.org/details/" + the first matched id
  """
  # Compiled once instead of once per line.
  idPattern = re.compile(r'details/(.*?)">')
  urlList = []
  # `with` guarantees the file is closed even if reading raises.
  with open(filePath) as f:
    for line in f:
      if not line.startswith("<a"):
        continue
      ids = idPattern.findall(line)
      if not ids:
        # Anchor line without a details link: skip it rather than
        # aborting the whole parse with an IndexError.
        continue
      urlList.append({
        'id': ids,
        'url': "http://archive.org/details/" + ids[0],
      })
  return urlList

def getOgvPath(page):
  """Build the direct .ogv URL from the text of a details page.

  The page is searched for three assignments:
    IAD.ogv = "<filename>"
    IAD.meta = {"server":"<server>"...
    IAD.thumbs = ["/<itemno>/...
  and the first occurrence of each is combined into
  "http://<server>/<itemno>/items/<filename>".

  Args:
    page: the page source as a string.

  Returns:
    The assembled URL string.

  Raises:
    IndexError: if any of the three fields cannot be found (same
      exception type the previous findall(...)[0] lookups raised).
  """
  # The dots are escaped so that only a literal "IAD." prefix matches;
  # the item number accepts one or more digits so directories such as
  # "/12/" are recognised as well as "/5/".
  patterns = (
    ('filename', r'IAD\.ogv\ \=\ "(.*?)"'),
    ('server', r'IAD\.meta\ \=\ \{"server"\:\"(.*?)"'),
    ('itemno', r'IAD\.thumbs\ \=\ \["\/([0-9]+)\/'),
  )
  found = {}
  for name, pattern in patterns:
    match = re.search(pattern, page)
    if match is None:
      raise IndexError("could not find %s in page" % name)
    found[name] = match.group(1)
  return "http://%s/%s/items/%s" % (
    found['server'], found['itemno'], found['filename'])

def do(filePath):
  """Resolve the .ogv URL for every item listed in filePath.

  Each entry returned by parseList(filePath) is fetched over HTTP and
  passed to getOgvPath(). Items that fail (download error or page that
  cannot be parsed) are skipped and their URL is printed, so one bad
  item does not stop the run.

  Args:
    filePath: path of the list file understood by parseList().

  Returns:
    A list of dicts {'id': <entry id>, 'path': <ogv URL>} for the
    items that were resolved successfully, in input order.
  """
  ogvs = []
  for entry in parseList(filePath):
    try:
      response = urllib2.urlopen(entry['url'])
      try:
        page = response.read()
      finally:
        # Release the connection even if read() fails.
        response.close()
      ogvPath = getOgvPath(page)
    except Exception:
      # Best-effort: report the failing URL and carry on. Unlike a
      # bare `except:`, this lets KeyboardInterrupt/SystemExit through
      # so the scrape can still be interrupted.
      print(entry['url'])
    else:
      ogvs.append({'id': entry['id'], 'path': ogvPath})
  return ogvs