"""Collect the .ogv paths of archive.org items listed in a local file.

Recovered from a git patch whose line breaks had been collapsed:

    From 0f7a3ce1772b28b4122e5f355912b0fdf07e02e0 Mon Sep 17 00:00:00 2001
    From: sanj
    Date: Wed, 24 Nov 2010 00:20:33 +0100
    Subject: [PATCH] added files

     scrapeArchive.py | 34 +++++++
     test.html        | 53 ++++----
     vidList.js       |  1 +
     3 files changed, 57 insertions(+), 31 deletions(-)
     create mode 100644 scrapeArchive.py
     create mode 100644 vidList.js

This module is the ``scrapeArchive.py`` part of that patch (new file,
index 0000000..f397a4b).  The patch continues with a ``test.html`` hunk
(index 48cea5a..b8788f1, ``@@ -4,7 +4,7 @@``) that is cut off in the
recovered text and is therefore not reproduced here.
"""

import re
import json

try:
    import urllib2  # Python 2, as in the original patch
except ImportError:
    # Python 3 moved urlopen() to urllib.request; alias it so the call
    # site below is identical on both versions.
    import urllib.request as urllib2

_DETAILS_URL = "http://archive.org/details/"

# NOTE(review): the original line prefix and id regex were destroyed when
# the patch text was extracted (everything between ``startswith("`` and
# ``').findall(l)`` is missing).  The two values below are a best-effort
# reconstruction and MUST be confirmed against the real list file format.
_LIST_LINE_PREFIX = "<a href"
_LIST_ID_PATTERN = r'/details/(.*?)">'

# Compiled once at import time; getOgvPath() runs once per fetched page.
_OGV_RE = re.compile(r'IAD\.ogv = "(.*?)"')
_SERVER_RE = re.compile(r'IAD\.meta = \{"server":"(.*?)"')
_ITEMNO_RE = re.compile(r'IAD\.thumbs = \["/([0-9])/')


def parseList(filePath, linePrefix=_LIST_LINE_PREFIX,
              idPattern=_LIST_ID_PATTERN):
    """Read a list file and build archive.org details URLs from it.

    Args:
        filePath: path of the text file to scan, one candidate per line.
        linePrefix: only lines starting with this string are considered.
            The default is a reconstruction; see the NOTE(review) above.
        idPattern: regex with one capture group that extracts the item
            id from a matching line.  The default is a reconstruction.

    Returns:
        A list of dicts ``{'id': matches, 'url': url}``.  As in the
        original code, ``'id'`` is the full list returned by
        ``findall`` for that line, and ``'url'`` is built from the
        first match.

    Lines that carry the prefix but yield no id are skipped instead of
    raising ``IndexError``.
    """
    id_re = re.compile(idPattern)
    urlList = []
    # ``with`` closes the file even if a line fails to parse.
    with open(filePath) as listing:
        for line in listing:
            if not line.startswith(linePrefix):
                continue
            matches = id_re.findall(line)
            if not matches:
                continue
            urlList.append({'id': matches, 'url': _DETAILS_URL + matches[0]})
    return urlList


def _first_match(pattern, page, what):
    """Return group 1 of the first *pattern* match in *page*.

    Raises:
        IndexError: when the pattern does not occur.  This is the same
            exception type the original ``findall(...)[0]`` produced.
    """
    found = pattern.search(page)
    if found is None:
        raise IndexError("could not find %s in page" % what)
    return found.group(1)


def getOgvPath(page):
    """Build the .ogv URL from the ``IAD.*`` assignments in *page*.

    Args:
        page: text of a details page containing ``IAD.ogv = "..."``,
            ``IAD.meta = {"server":"..."`` and ``IAD.thumbs = ["/N/``.

    Returns:
        ``"http://<server>/<N>/items/<ogv>"``.

    Raises:
        IndexError: if any of the three values is missing from *page*.
    """
    filename = _first_match(_OGV_RE, page, "IAD.ogv")
    server = _first_match(_SERVER_RE, page, "IAD.meta server")
    # Only a single digit is captured after the leading slash, exactly
    # as the original pattern did.
    itemno = _first_match(_ITEMNO_RE, page, "IAD.thumbs item number")
    return "http://%s/%s/items/%s" % (server, itemno, filename)


def do(filePath):
    """Fetch every URL from the list file and collect its .ogv path.

    Args:
        filePath: path handed to :func:`parseList`.

    Returns:
        A list of dicts ``{'id': id, 'path': ogvPath}`` for every item
        whose page could be fetched and parsed.  For items that fail,
        the URL is printed and processing continues (best effort, as in
        the original).
    """
    ogvs = []
    for entry in parseList(filePath):
        try:
            response = urllib2.urlopen(entry['url'])
            try:
                page = response.read()
            finally:
                response.close()
            # Python 3 returns bytes; the regexes operate on text.
            if not isinstance(page, str):
                page = page.decode('utf-8', 'replace')
            ogvs.append({'id': entry['id'], 'path': getOgvPath(page)})
        except Exception:
            # Narrower than the original bare ``except:`` so Ctrl-C and
            # SystemExit still stop the run.
            print(entry['url'])
    return ogvs