"""Rewrite pad.ma links stored in ``PadmaText.html``.

Every ``http://<host>pad.ma/V...`` link found in a text is re-pointed at
``NEXT`` and replaced by the final URL that ``urllib2`` reports after opening
it (i.e. after following any redirects).
"""
from contextlib import closing
import re
import urllib2

from texts.models import *

NEXT = "http://next.pad.ma/"
OLD = "http://pad.ma/"

# Cache used by getNewId(): old id -> id reported by NEXT.
IdMapping = {}

# A pad.ma link anywhere in a text.  Group 1 is the whole URL, group 2 the
# path (it always starts with "V").  The host part is limited to dotted
# labels and the dots in "pad.ma" are escaped, so a match can neither start
# at an unrelated earlier "http://" nor accept look-alike hosts.  The path
# runs up to (not including) the next whitespace, '"' or '<', or to the end
# of the text.
regex = re.compile(r'(http://(?:[\w-]+\.)*pad\.ma/(V[^\s"<]*))')

# Same link shape, anchored at the end of the string; used on a single URL.
regex2 = re.compile(r'(http://(?:[\w-]+\.)*pad\.ma/(V.*?))$')


def getNewUrl(oldUrl):
    """Return the URL that ``oldUrl`` resolves to on ``NEXT``.

    The scheme and host of ``oldUrl`` are swapped for ``NEXT`` and the result
    is opened; the URL reported by the response is returned.

    Returns ``None`` (after printing an ``ERROR:`` line) when the request
    fails for any reason.
    """
    new = regex2.sub(lambda pat: NEXT + pat.group(2), oldUrl)
    try:
        # closing() because Python 2 responses are not context managers.
        with closing(urllib2.urlopen(new)) as response:
            return response.url
    except Exception:
        # Best effort: report and let the caller skip this link.  Unlike a
        # bare ``except:`` this still lets Ctrl-C / SystemExit stop the run.
        print("ERROR: " + oldUrl)
        return None


def getNewId(oldId):
    """Return the id that ``NEXT`` reports for a matched pad.ma link.

    ``oldId`` is, despite its name, a match object produced by ``regex``;
    the old id is its second capture group.  Results are memoised in
    ``IdMapping``.  Network errors propagate to the caller.
    """
    # Use the captured path rather than stripping OLD from the full match,
    # so links on other pad.ma hosts yield a proper id as well.
    oldId = oldId.group(2)
    print(oldId)
    if oldId in IdMapping:
        return IdMapping[oldId]
    with closing(urllib2.urlopen(NEXT + oldId)) as response:
        newId = response.url.replace(NEXT, "")
    IdMapping[oldId] = newId
    return newId


def replaceUrls(text):
    """Return ``text`` with every pad.ma link replaced by its new URL.

    Links that cannot be resolved (``getNewUrl`` returned ``None``) are left
    untouched.  Empty or ``None`` input is returned unchanged.
    """
    if not text:
        return text

    # Per-call cache so a link that occurs several times is fetched once.
    resolved = {}

    def _swap(match):
        oldUrl = match.group(1)
        if oldUrl not in resolved:
            resolved[oldUrl] = getNewUrl(oldUrl)
        newUrl = resolved[oldUrl]
        return oldUrl if newUrl is None else newUrl

    # Substituting match by match (instead of a global str.replace per URL)
    # keeps a short URL from clobbering a longer one that shares its prefix.
    return regex.sub(_swap, text)


def doAll():
    """Rewrite the links in the ``html`` of every ``PadmaText`` and save it."""
    for p in PadmaText.objects.all():
        print(p.title)
        p.html = replaceUrls(p.html)
        p.save()