119/scrapeArchive.py
2010-11-24 00:20:33 +01:00

35 lines
918 B
Python

import re
import urllib2
import json
def parseList(filePath):
    """Collect archive.org details-page URLs from a saved listing file.

    Each line of the file at ``filePath`` that starts with ``<a`` is
    searched for ``details/<identifier>">``.  Lines that do not start with
    ``<a``, and anchor lines that contain no such link, are skipped.

    Returns a list of dicts, one per matching line, with two keys:
      'id'  -- the list of every identifier matched on that line
               (the whole findall() result, not only the first entry)
      'url' -- "http://archive.org/details/" followed by the first identifier
    """
    # Compiled once up front instead of once per line.
    idPattern = re.compile(r'details/(.*?)">')
    urlList = []
    # The with-block closes the file even if reading fails part-way through.
    with open(filePath) as listing:
        for line in listing:
            if not line.startswith("<a"):
                continue
            ids = idPattern.findall(line)
            if not ids:
                # An anchor that does not point at a details page has no
                # identifier to record; skip it rather than fail on ids[0].
                continue
            urlList.append({
                'id': ids,
                'url': "http://archive.org/details/" + ids[0],
            })
    return urlList
def getOgvPath(page):
    """Build the URL of an item's .ogv file from its details-page text.

    Three values are extracted from ``page`` with regular expressions:
    the quoted string assigned to ``IAD.ogv``, the "server" value that
    opens the ``IAD.meta`` object, and the single digit that forms the
    first path segment of the first ``IAD.thumbs`` entry.  They are joined
    as ``http://<server>/<digit>/items/<ogv name>``.

    Raises IndexError if any of the three patterns is not found in ``page``.
    """
    def firstMatch(pattern):
        # Indexing the findall() result raises IndexError when nothing matched.
        return re.findall(pattern, page)[0]

    ogvName = firstMatch(r'IAD\.ogv\ \=\ "(.*?)"')
    host = firstMatch(r'IAD.meta\ \=\ \{"server"\:\"(.*?)"')
    digit = firstMatch(r'IAD.thumbs\ \=\ \["\/([0-9])\/')
    return "http://%s/%s/items/%s" % (host, digit, ogvName)
def do(filePath):
    """Resolve the .ogv URL for every item listed in ``filePath``.

    The listing is parsed with parseList(); each item's details page is
    downloaded with urllib2 and passed to getOgvPath().

    Returns a list of dicts with keys 'id' (copied unchanged from the
    parseList() entry) and 'path' (the .ogv URL).  Items whose page cannot
    be fetched or parsed are left out of the result and their details URL
    is printed instead.
    """
    ogvs = []
    for u in parseList(filePath):
        try:
            page = urllib2.urlopen(u['url']).read()
            ogvs.append({'id': u['id'], 'path': getOgvPath(page)})
        except Exception:
            # Best effort: report the failing URL and move on.  Catching
            # Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit stop the run.
            # Parenthesised single-argument print behaves the same as the
            # print statement and is also valid as a function call.
            print(u['url'])
    return ogvs