119/scrapeArchive.py

import re
import urllib2
import json

def parseList(filePath):
  """Collect archive.org detail-page links from a saved HTML listing.

  Scans filePath line by line. Every line that starts with ``<a`` and
  contains a ``details/<identifier>">`` fragment yields one entry.

  Args:
    filePath: path of the text/HTML file to scan.

  Returns:
    A list of dicts in file order, each with:
      'id'  -- the list of identifiers captured on that line by findall()
               (deliberately left as a list; that is the shape this
               function has always handed to its callers)
      'url' -- "http://archive.org/details/" + the first identifier
  """
  # Compile once instead of once per line.
  detailsPattern = re.compile(r'details/(.*?)">')
  urlList = []
  # `with` guarantees the file is closed even if reading fails part-way.
  with open(filePath) as listing:
    for line in listing:
      if not line.startswith("<a"):
        continue
      matches = detailsPattern.findall(line)
      if not matches:
        # Anchor that does not link to a details page: skip it rather
        # than abort the whole parse with an IndexError.
        continue
      url = "http://archive.org/details/" + matches[0]
      urlList.append({'id': matches, 'url': url})
  return urlList

def getOgvPath(page):
  """Build the URL of the .ogv file from the source of a details page.

  Three values are scraped out of the page text:
    * the quoted value assigned to ``IAD.ogv`` (the file path),
    * the "server" value at the start of the ``IAD.meta`` object,
    * the leading numeric path component of the first ``IAD.thumbs`` entry.

  Args:
    page: the page source as a string.

  Returns:
    The string "http://<server>/<number>/items/<file path>".

  Raises:
    IndexError: if any of the three values cannot be found in page. The
      message names the missing value.
  """
  def firstCapture(pattern, label):
    # First capture group of the first match, or a descriptive IndexError
    # (same exception type an empty findall()[0] would have produced).
    found = re.search(pattern, page)
    if found is None:
      raise IndexError("%s not found in page" % label)
    return found.group(1)

  filename = firstCapture(r'IAD\.ogv = "(.*?)"', "IAD.ogv")
  server = firstCapture(r'IAD\.meta = \{"server":"(.*?)"', "IAD.meta server")
  # [0-9]+ so that multi-digit directory numbers (e.g. /12/items/...) match;
  # a single [0-9] only accepted /0/ .. /9/.
  itemno = firstCapture(r'IAD\.thumbs = \["/([0-9]+)/', "IAD.thumbs number")
  return "http://%s/%s/items/%s" % (server, itemno, filename)

def do(filePath):
  """Resolve the .ogv URL for every details link listed in filePath.

  The links come from parseList(filePath). Each details page is downloaded
  with urllib2 and handed to getOgvPath().

  Args:
    filePath: path of the listing file understood by parseList().

  Returns:
    A list of dicts {'id': <id value from parseList>, 'path': <ogv URL>},
    one per page that was fetched and parsed successfully. Pages that fail
    are left out and their URL is printed to stdout.
  """
  ogvs = []
  for entry in parseList(filePath):
    try:
      response = urllib2.urlopen(entry['url'])
      try:
        page = response.read()
      finally:
        # Release the connection whether or not the read succeeded.
        response.close()
      ogvPath = getOgvPath(page)
      ogvs.append({'id': entry['id'], 'path': ogvPath})
    except Exception:
      # Best effort: report the failing URL and move on. Catching
      # Exception (not a bare except) lets Ctrl-C / SystemExit stop the run.
      # Parenthesised form prints the same text under the Python 2
      # print statement.
      print(entry['url'])
  return ogvs
added files 2010-11-23 23:20:33 +00:00
			`import re`
			`import urllib2`
			`import json`

			`def parseList(filePath):`
			`f = open(filePath)`
			`urlList = []`
			`for l in f:`
			`if l.startswith("<a"):`
			`id = re.compile(r'details/(.*?)">').findall(l)`
			`url = "http://archive.org/details/" + id[0]`
			`urlList.append({'id': id, 'url': url})`
			`f.close()`
			`return urlList`

			`def getOgvPath(page):`
			`filename = re.compile(r'IAD\.ogv\ \=\ "(.*?)"').findall(page)`
			`server = re.compile(r'IAD.meta\ \=\ \{"server"\:\"(.*?)"').findall(page)`
			`itemno = re.compile(r'IAD.thumbs\ \=\ \["\/([0-9])\/').findall(page)`
			`path = "http://%s/%s/items/%s" % (server[0], itemno[0], filename[0],)`
			`return path`

			`def do(filePath):`
			`urls = parseList(filePath)`
			`ogvs = []`
			`for u in urls:`
			`try:`
			`page = urllib2.urlopen(u['url']).read()`
			`ogvPath = getOgvPath(page)`
			`id = u['id']`
			`ogvs.append({'id': id, 'path': ogvPath})`
			`except:`
			`print u['url']`
			`return ogvs`