35 lines
918 B
Python
35 lines
918 B
Python
|
import re
|
||
|
import urllib2
|
||
|
import json
|
||
|
|
||
|
def parseList(filePath):
    """Collect archive.org detail-page URLs from a saved link list.

    Every line of the file that starts with ``<a`` is searched for
    ``details/<identifier>">``; the first identifier found is appended to
    ``http://archive.org/details/`` to form the item URL.

    Args:
        filePath: path of the text/HTML file to scan, one anchor per line.

    Returns:
        A list of dicts, one per matching line, in file order:
        ``{'id': <list of every identifier matched on the line>,
        'url': <detail-page URL built from the first identifier>}``.
        Note that ``'id'`` is the full list of matches, not a single string;
        this shape is kept for backward compatibility with existing callers.
    """
    # Compile once instead of once per line.
    idPattern = re.compile(r'details/(.*?)">')
    urlList = []
    # "with" guarantees the file is closed even if reading fails part-way.
    with open(filePath) as f:
        for line in f:
            if not line.startswith("<a"):
                continue
            ids = idPattern.findall(line)
            if not ids:
                # Anchor that does not point at a details page: skip it
                # rather than abort the whole parse with an IndexError.
                continue
            urlList.append({
                'id': ids,
                'url': "http://archive.org/details/" + ids[0],
            })
    return urlList
|
||
|
|
||
|
def getOgvPath(page):
    """Build the direct URL of an item's .ogv file from its page text.

    Three values are scraped from JavaScript-style assignments in ``page``:
    the server name from ``IAD.meta = {"server":"..."``, the numeric
    directory from the first entry of ``IAD.thumbs = ["/<digits>/...``, and
    the file name from ``IAD.ogv = "..."``. The first occurrence of each
    is used.

    Args:
        page: string to search.

    Returns:
        The string ``http://<server>/<directory number>/items/<file name>``.

    Raises:
        IndexError: if any of the three values is not present in ``page``.
    """
    # Dots are escaped so they match a literal "." only, and the directory
    # number may have any number of digits (not just one).
    patterns = (
        ('server', r'IAD\.meta = \{"server":"(.*?)"'),
        ('directory number', r'IAD\.thumbs = \["/([0-9]+)/'),
        ('file name', r'IAD\.ogv = "(.*?)"'),
    )
    parts = []
    for label, pattern in patterns:
        match = re.search(pattern, page)
        if match is None:
            # Same exception type as indexing an empty findall() result,
            # but with a message that says which field was missing.
            raise IndexError("could not find %s in page" % label)
        parts.append(match.group(1))
    return "http://%s/%s/items/%s" % tuple(parts)
|
||
|
|
||
|
def do(filePath):
    """Resolve the .ogv URL for every item listed in ``filePath``.

    The list file is parsed with ``parseList``; each resulting page URL is
    downloaded and handed to ``getOgvPath``. Items whose download or
    parsing fails are skipped, and their page URL is printed so they can
    be retried by hand.

    Args:
        filePath: path of the link-list file accepted by ``parseList``.

    Returns:
        A list of ``{'id': <id from parseList>, 'path': <ogv URL>}`` dicts,
        one per successfully resolved item, in input order.
    """
    ogvs = []
    for u in parseList(filePath):
        try:
            response = urllib2.urlopen(u['url'])
            try:
                page = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
            ogvPath = getOgvPath(page)
        except Exception:
            # Best effort: report the failing URL and carry on. Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit
            # still stop the run.
            print(u['url'])
            continue
        ogvs.append({'id': u['id'], 'path': ogvPath})
    return ogvs
|