added comments to import_atlas, but it is still wrong :(

This commit is contained in:
Sanj 2012-01-11 04:32:38 +05:30
parent eae4d52bec
commit 895ab5de04
4 changed files with 22305 additions and 10129 deletions

Binary file not shown.

View File

@ -2,29 +2,15 @@ from settings import PROJECT_ROOT
from os.path import join from os.path import join
import json import json
import csv import csv
import pdb import pdb #debugger
from mumbai.models import * from mumbai.models import *
from fuzzywuzzy import process as fuzzprocess from fuzzywuzzy import process as fuzzprocess
import datetime import datetime
#Get levenshtein distance between two strings, from http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance '''
def levenshtein(s1, s2): Convert Atlas.csv file (obtained from BEST) into first stage Atlas.json
if len(s1) < len(s2): (step 1)
return levenshtein(s2, s1) '''
if not s1:
return len(s2)
previous_row = xrange(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
deletions = current_row[j] + 1 # than s2
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
def csvToJSON(): def csvToJSON():
atlasCSV = csv.reader(open(join(PROJECT_ROOT, "../db_csv_files/Atlas.csv"), "r"), delimiter="\t") atlasCSV = csv.reader(open(join(PROJECT_ROOT, "../db_csv_files/Atlas.csv"), "r"), delimiter="\t")
atlasDict = {} atlasDict = {}
@ -44,6 +30,7 @@ def csvToJSON():
''' '''
function to copy over values of AM N PM + Schedule from previous row, reading from Atlas.json, writing to atlasCopied.json function to copy over values of AM N PM + Schedule from previous row, reading from Atlas.json, writing to atlasCopied.json
(fill in blank rows where 'copy from previous' is assumed, and create new json file - step 2)
''' '''
def processJSON(): def processJSON():
routeErrors = {'routes': [], 'others': []} routeErrors = {'routes': [], 'others': []}
@ -85,14 +72,18 @@ def processJSON():
''' '''
function to group atlasCopied.json to uniqueRoutes (uniqueRoutes.json) function to group atlasCopied.json to uniqueRoutes (uniqueRoutes.json)
(step 3)
''' '''
def groupUnique(): def groupUnique():
routes = json.loads(open(join(PROJECT_ROOT, "../db_csv_files/atlasCopied.json")).read()) routes = json.loads(open(join(PROJECT_ROOT, "../db_csv_files/atlasCopied.json")).read())
errors = {} errors = {}
outDict = {} outDict = {}
for key in routes.keys(): for key in routes.keys():
outDict[key] = [] outDict[key] = []
for row in routes[key]: for row in routes[key]:
i = 0
print key print key
d = { d = {
'from': row[7], 'from': row[7],
@ -105,47 +96,31 @@ def groupUnique():
} }
} }
matchedRow = isNotUnique(d, outDict[key]) matchedRow = isNotUnique(d, outDict[key])
schedule = row[-5]
if matchedRow: if matchedRow:
schedule = row[-5] outDict[key][i-1]['rows'][schedule] = row
outDict[key][matchedRow]['rows'][schedule] = row
else: else:
if isLargestSpan(d, routes[key]): if isLargestSpan(d, routes[key]):
d['is_full'] = True d['is_full'] = True
outDict[key].append(d) outDict[key].append(d)
outDict[key][i]['rows'][schedule] = row
i += 1
outFile = open(join(PROJECT_ROOT, "../db_csv_files/uniqueRoutes.json"), "w") outFile = open(join(PROJECT_ROOT, "../db_csv_files/uniqueRoutes.json"), "w")
outFile.write(json.dumps(outDict, indent=2)) outFile.write(json.dumps(outDict, indent=2))
outFile.close() outFile.close()
''' '''
Import RouteMaster Go through uniqueRoutes.json and actually import atlas data into the db
(step 4)
''' '''
def importRouteMaster():
CsvFile = csv.reader(open(join(PROJECT_ROOT, "../db_csv_files/RouteMaster.csv"), "r"), delimiter=',')
test = CsvFile.next()
stop_errors = []
print test
for row in CsvFile:
if len(row) < 1:
continue
from_to = getFromToStopsForRoute(row[0])
if from_to is None:
stop_errors.append(row[0])
continue
print row[0]
obj = Route(code=row[0], alias=row[1], from_stop_txt=row[2], to_stop_txt=row[3], from_stop=from_to[0], to_stop=from_to[1], distance=row[4], stages=int(row[5]))
obj.save()
errors = open(join(PROJECT_ROOT, "../errors/routeStopErrors.json"), "w")
errors.write(json.dumps(stop_errors, indent=2))
errors.close()
def importUniqueRoutes(): def importUniqueRoutes():
data = json.load(open(join(PROJECT_ROOT, "../db_csv_files/uniqueRoutes.json"))) data = json.load(open(join(PROJECT_ROOT, "../db_csv_files/uniqueRoutes.json")))
routeMapping = json.load(open(join(PROJECT_ROOT, "../db_csv_files/routeMapping.json"))) routeMapping = json.load(open(join(PROJECT_ROOT, "../db_csv_files/routeMapping.json")))
routeDoesNotExistErrors = [] routeDoesNotExistErrors = [] #route codes for which there are entries in routeMapping.json and in Atlas, but which do not exist in RouteMaster
stopMapping = {} stopMapping = {} #FIXME
stopErrors = [] stopErrors = [] #This should ideally never happen, and any errors here are bad and would indicate problems with the fuzzy matching logic, most likely.
for route in data.keys(): for route in data.keys():
routeCode = routeMapping[route] routeCode = routeMapping[route]
try: try:
@ -159,8 +134,8 @@ def importUniqueRoutes():
distance = float(thisRoute['span']) distance = float(thisRoute['span'])
except: except:
distance = 0 distance = 0
obj = UniqueRoute(route=routeObj, is_full=thisRoute['is_full'], distance=distance, from_stop_txt=thisRoute['from'], to_stop_txt=thisRoute['to']) obj = UniqueRoute(route=routeObj, is_full=thisRoute['is_full'], distance=distance, from_stop_txt=thisRoute['from'], to_stop_txt=thisRoute['to'])
if obj.is_full: if obj.is_full: #If the route is the primary route, we can get stop codes easily from RouteDetails first / last stop
from_to = getFromToStopsForRoute(routeObj.code) from_to = getFromToStopsForRoute(routeObj.code)
obj.from_stop = from_to[0] obj.from_stop = from_to[0]
if not stopMapping.has_key(obj.from_stop_txt): if not stopMapping.has_key(obj.from_stop_txt):
@ -168,7 +143,7 @@ def importUniqueRoutes():
obj.to_stop = from_to[1] obj.to_stop = from_to[1]
if not stopMapping.has_key(obj.to_stop_txt): if not stopMapping.has_key(obj.to_stop_txt):
stopMapping[obj.to_stop_txt] = from_to[1].stopcd stopMapping[obj.to_stop_txt] = from_to[1].stopcd
else: else: #Else we do fuzzy string matching against all possible values for stopname got from RouteDetails
stopnames = [] stopnames = []
stopcodes = [] stopcodes = []
for r in RouteDetails.objects.filter(rno=routeObj.code): for r in RouteDetails.objects.filter(rno=routeObj.code):
@ -183,10 +158,11 @@ def importUniqueRoutes():
except: except:
stopErrors.append([thisRoute['from'], thisRoute['to']]) stopErrors.append([thisRoute['from'], thisRoute['to']])
continue continue
obj.save() obj.save()
#pdb.set_trace() #pdb.set_trace()
# print thisRoute['rows'].keys() # print thisRoute['rows'].keys()
for schedule in thisRoute['rows'].keys(): for schedule in thisRoute['rows'].keys(): #loop through each schedule per UniqueRoute and save it
row = thisRoute['rows'][schedule] row = thisRoute['rows'][schedule]
try: try:
depot = Depot.objects.get(depot_code=row[6]) depot = Depot.objects.get(depot_code=row[6])
@ -195,6 +171,8 @@ def importUniqueRoutes():
#pdb.set_trace() #pdb.set_trace()
routeScheduleObj = RouteSchedule(unique_route=obj, schedule_type=schedule, busesAM=noneInt(row[2]), busesN=noneInt(row[3]), busesPM=noneInt(row[4]), bus_type=row[5], depot_txt=row[6], depot=depot, first_from=formatTime(row[8]), last_from=formatTime(row[9]), first_to=formatTime(row[11]), last_to=formatTime(row[12]), runtime1=noneInt(row[14]), runtime2=noneInt(row[15]), runtime3=noneInt(row[16]), runtime4=noneInt(row[17]), headway1=noneInt(row[18]), headway2=noneInt(row[19]), headway3=noneInt(row[20]), headway4=noneInt(row[21]), headway5=noneInt(row[22])) routeScheduleObj = RouteSchedule(unique_route=obj, schedule_type=schedule, busesAM=noneInt(row[2]), busesN=noneInt(row[3]), busesPM=noneInt(row[4]), bus_type=row[5], depot_txt=row[6], depot=depot, first_from=formatTime(row[8]), last_from=formatTime(row[9]), first_to=formatTime(row[11]), last_to=formatTime(row[12]), runtime1=noneInt(row[14]), runtime2=noneInt(row[15]), runtime3=noneInt(row[16]), runtime4=noneInt(row[17]), headway1=noneInt(row[18]), headway2=noneInt(row[19]), headway3=noneInt(row[20]), headway4=noneInt(row[21]), headway5=noneInt(row[22]))
routeScheduleObj.save() routeScheduleObj.save()
#done saving things - write out error files:
errors = open(join(PROJECT_ROOT, "../errors/routeMasterMissingRoutes.json"), "w") errors = open(join(PROJECT_ROOT, "../errors/routeMasterMissingRoutes.json"), "w")
errors.write(json.dumps(routeDoesNotExistErrors, indent=2)) errors.write(json.dumps(routeDoesNotExistErrors, indent=2))
errors.close() errors.close()
@ -227,12 +205,20 @@ def formatTime(s):
except: except:
return datetime.time(0,0) return datetime.time(0,0)
'''
Silly function to deal with invalid strings in the data that need to go in as Integers into the db
passed a string, it will either return int(string) or None if that fails for any reason
FIXME: find a more elegant way to do this
'''
def noneInt(val):
    """Convert val to an int, or return None when it cannot be converted.

    Used for fields that must be stored as integers but may arrive as
    blank or otherwise invalid values.

    Params:
        val - any value accepted by int() (string, number, ...)

    Returns:
        int(val) on success, otherwise None.
    """
    try:
        return int(val)
    # Only the errors int() raises for bad input are treated as "no value";
    # a bare except would also swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return None
'''
Passed a route code, it gets stop codes for the first and last stop
'''
def getFromToStopsForRoute(routeCode): def getFromToStopsForRoute(routeCode):
# fromStr = row[2] # fromStr = row[2]
routeDetails = RouteDetails.objects.filter(rno=routeCode).order_by('stopsr') routeDetails = RouteDetails.objects.filter(rno=routeCode).order_by('stopsr')
@ -243,7 +229,12 @@ def getFromToStopsForRoute(routeCode):
return (fromStop, toStop,) return (fromStop, toStop,)
'''
checks whether the row in a set of rows for a route has the largest 'span' value, useful to tell if a row belongs to a primary route
params:
data - dict with a span attribute
arr - array of rows to check if data['span'] is greater than. span is at row[13]
'''
def isLargestSpan(data, arr): def isLargestSpan(data, arr):
span = data['span'] span = data['span']
for a in arr: for a in arr:
@ -265,12 +256,15 @@ def isLargestSpan(data, arr):
def isNotUnique(data, arr):
    """Look for an entry in arr that has the same 'from' and 'to' as data.

    Params:
        data - dict with 'from' and 'to' keys
        arr  - list of dicts, each with 'from' and 'to' keys

    Returns:
        The index of the first matching entry in arr, or False when no
        entry matches.

    NOTE: a match at index 0 is returned as 0, which is falsy. Callers that
    need to tell "matched the first entry" apart from "no match" must test
    the result with `is not False` rather than plain truthiness.
    """
    for index, existing in enumerate(arr):
        if existing['from'] == data['from'] and existing['to'] == data['to']:
            return index
    return False
'''
Create routeMapping.json file to map route aliases to route codes
TODO: add mappings from hard coded routes
'''
def getRouteCodes(): def getRouteCodes():
atlasRawCSV = csv.reader(open(join(PROJECT_ROOT, "../db_csv_files/AtlasRaw.csv"), "r"), delimiter="\t") atlasRawCSV = csv.reader(open(join(PROJECT_ROOT, "../db_csv_files/AtlasRaw.csv"), "r"), delimiter="\t")
atlasDict = json.loads(open(join(PROJECT_ROOT, "../db_csv_files/Atlas.json")).read()) atlasDict = json.loads(open(join(PROJECT_ROOT, "../db_csv_files/Atlas.json")).read())
@ -290,6 +284,28 @@ def getRouteCodes():
mappingFile.close() mappingFile.close()
'''
Import RouteMaster into db
'''
def importRouteMaster():
    """Read RouteMaster.csv and save one Route object per data row.

    The first CSV row is consumed as a header and printed. For every other
    non-empty row, the first and last stops are looked up from the route
    code in column 0 via getFromToStopsForRoute(); rows for which that
    lookup returns None are not saved, and their route codes are collected
    instead. The collected codes are written to
    ../errors/routeStopErrors.json once the whole file has been processed.

    Column layout used: 0 code, 1 alias, 2 from-stop text, 3 to-stop text,
    4 distance, 5 stages (converted with int()).

    An empty CSV file raises StopIteration when the header is read.
    """
    stop_errors = []
    # 'with' guarantees the CSV handle is closed even if a row fails to save.
    with open(join(PROJECT_ROOT, "../db_csv_files/RouteMaster.csv"), "r") as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        header = next(reader)  # skip the header row
        print(header)
        for row in reader:
            if len(row) < 1:
                continue
            from_to = getFromToStopsForRoute(row[0])
            if from_to is None:
                # No stops found for this route code: record it and move on.
                stop_errors.append(row[0])
                continue
            print(row[0])
            obj = Route(code=row[0], alias=row[1], from_stop_txt=row[2], to_stop_txt=row[3], from_stop=from_to[0], to_stop=from_to[1], distance=row[4], stages=int(row[5]))
            obj.save()
    with open(join(PROJECT_ROOT, "../errors/routeStopErrors.json"), "w") as errors:
        errors.write(json.dumps(stop_errors, indent=2))
def csvClean1(): def csvClean1():
atlasCSV = csv.reader(open(join(PROJECT_ROOT, "../db_csv_files/Atlas.csv"), "r"), delimiter="\t") atlasCSV = csv.reader(open(join(PROJECT_ROOT, "../db_csv_files/Atlas.csv"), "r"), delimiter="\t")

View File

@ -2,6 +2,9 @@ from django.contrib import admin
from django import forms from django import forms
from mumbai.models import * from mumbai.models import *
# Admin inline for RouteSchedule, using Django's stacked inline layout.
# Referenced from UniqueRouteAdmin.inlines in this file.
class RouteScheduleInline(admin.StackedInline):
model = RouteSchedule
class AreaAdmin(admin.ModelAdmin): class AreaAdmin(admin.ModelAdmin):
list_display = ("a_code", "areanm") list_display = ("a_code", "areanm")
list_editable = ("areanm",) list_editable = ("areanm",)
@ -26,6 +29,9 @@ class FareAdmin(admin.ModelAdmin):
models.TextField: {'widget': forms.TextInput}, models.TextField: {'widget': forms.TextInput},
} }
# Admin options that embed RouteScheduleInline on the change page of the
# model this class is registered for (UniqueRoute, per admin.site.register).
class UniqueRouteAdmin(admin.ModelAdmin):
inlines = [RouteScheduleInline]
class StopForm(forms.ModelForm): class StopForm(forms.ModelForm):
@ -129,3 +135,5 @@ admin.site.register(Landmark, LandmarkAdmin )
admin.site.register(Depot,DepotAdmin) admin.site.register(Depot,DepotAdmin)
admin.site.register(Holiday,HolidayAdmin) admin.site.register(Holiday,HolidayAdmin)
admin.site.register(StopLocation,StopLocationAdmin) admin.site.register(StopLocation,StopLocationAdmin)
admin.site.register(UniqueRoute, UniqueRouteAdmin)

File diff suppressed because it is too large Load Diff