added comments to import_atlas, but it is still wrong :(

2012-01-11 04:32:38 +05:30 · 2012-01-11 04:32:38 +05:30 · 895ab5de04
commit 895ab5de04
parent eae4d52bec
4 changed files with 22305 additions and 10129 deletions
--- a/chaloBEST/chalobest.db
+++ b/chaloBEST/chalobest.db
--- a/chaloBEST/imports/import_atlas.py
+++ b/chaloBEST/imports/import_atlas.py
@ -2,29 +2,15 @@ from settings import PROJECT_ROOT
 from os.path import join
 import json
 import csv
-import pdb
+import pdb #debugger
 from mumbai.models import *
 from fuzzywuzzy import process as fuzzprocess
 import datetime
-#Get levenshtein distance between two strings, from http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance
+'''
-def levenshtein(s1, s2):
+Convert Atlas.csv file (obtained from BEST) into first stage Atlas.json
-    if len(s1) < len(s2):
+(step 1)
-        return levenshtein(s2, s1)
+'''
    if not s1:
        return len(s2)
    previous_row = xrange(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]
 def csvToJSON():
    atlasCSV = csv.reader(open(join(PROJECT_ROOT, "../db_csv_files/Atlas.csv"), "r"), delimiter="\t")
    atlasDict = {}
@ -44,6 +30,7 @@ def csvToJSON():
 '''
 function to copy over values of AM N PM + Schedule from previous row, reading from Atlas.json, writing to atlasCopied.json
 (fill in blank rows where 'copy from previous' is assumed, and create new json file - step 2)
 '''
 def processJSON():
    routeErrors = {'routes': [], 'others': []}
@ -85,14 +72,18 @@ def processJSON():
 '''
 function to group atlasCopied.json to uniqueRoutes (uniqueRoutes.json)
 (step 3)
 '''
 def groupUnique():
    routes = json.loads(open(join(PROJECT_ROOT, "../db_csv_files/atlasCopied.json")).read())
    errors = {}
    outDict = {}
    for key in routes.keys():
        outDict[key] = []
        for row in routes[key]:
            i = 0
            print key
            d = {
                'from': row[7],
@ -105,47 +96,31 @@ def groupUnique():
                }   
            }
            matchedRow = isNotUnique(d, outDict[key])
            schedule = row[-5]
            if matchedRow:
-                schedule = row[-5]
+                outDict[key][i-1]['rows'][schedule] = row
                outDict[key][matchedRow]['rows'][schedule] = row
            else:
                if isLargestSpan(d, routes[key]):
                    d['is_full'] = True
                outDict[key].append(d)
                outDict[key][i]['rows'][schedule] = row
                i += 1
    outFile = open(join(PROJECT_ROOT, "../db_csv_files/uniqueRoutes.json"), "w")
    outFile.write(json.dumps(outDict, indent=2))
    outFile.close()
 '''
-Import RouteMaster
+Go through uniqueRoutes.json and actually import atlas data into the db
 (step 4)
 '''
 def importRouteMaster():
    CsvFile = csv.reader(open(join(PROJECT_ROOT, "../db_csv_files/RouteMaster.csv"), "r"), delimiter=',')
    test = CsvFile.next()
    stop_errors = []
    print test
    for row in CsvFile:
        if len(row) < 1:
            continue
        from_to = getFromToStopsForRoute(row[0])
        if from_to is None:
            stop_errors.append(row[0])
            continue
        print row[0]
        obj = Route(code=row[0], alias=row[1], from_stop_txt=row[2], to_stop_txt=row[3], from_stop=from_to[0], to_stop=from_to[1], distance=row[4], stages=int(row[5]))
        obj.save()
    errors = open(join(PROJECT_ROOT, "../errors/routeStopErrors.json"), "w")
    errors.write(json.dumps(stop_errors, indent=2))
    errors.close()
 def importUniqueRoutes():
    data = json.load(open(join(PROJECT_ROOT, "../db_csv_files/uniqueRoutes.json")))
    routeMapping = json.load(open(join(PROJECT_ROOT, "../db_csv_files/routeMapping.json")))
-    routeDoesNotExistErrors = []
+    routeDoesNotExistErrors = [] #route codes for which there are entries in routeMapping.json and in Atlas, but which do not exist in RouteMaster
-    stopMapping = {}
+    stopMapping = {} #FIXME
-    stopErrors = []
+    stopErrors = [] #This should ideally never happen, and any errors here are bad and would indicate problems with the fuzzy matching logic, most likely.
    for route in data.keys():
        routeCode = routeMapping[route]
        try:
@ -159,8 +134,8 @@ def importUniqueRoutes():
                distance = float(thisRoute['span'])
            except:
                distance = 0
-            obj = UniqueRoute(route=routeObj, is_full=thisRoute['is_full'], distance=distance, from_stop_txt=thisRoute['from'], to_stop_txt=thisRoute['to'])
+            obj = UniqueRoute(route=routeObj, is_full=thisRoute['is_full'], distance=distance, from_stop_txt=thisRoute['from'], to_stop_txt=thisRoute['to']) 
-            if obj.is_full:
+            if obj.is_full: #If the route is the primary route, we can get stop codes easily from RouteDetails first / last stop
                from_to = getFromToStopsForRoute(routeObj.code)
                obj.from_stop = from_to[0]
                if not stopMapping.has_key(obj.from_stop_txt):
@ -168,7 +143,7 @@ def importUniqueRoutes():
                obj.to_stop = from_to[1]
                if not stopMapping.has_key(obj.to_stop_txt):
                    stopMapping[obj.to_stop_txt] = from_to[1].stopcd
-            else:
+            else: #Else we do fuzzy string matching against all possible values for stopname got from RouteDetails
                stopnames = []
                stopcodes = []
                for r in RouteDetails.objects.filter(rno=routeObj.code):
@ -183,10 +158,11 @@ def importUniqueRoutes():
                except:
                    stopErrors.append([thisRoute['from'], thisRoute['to']])
                    continue
            obj.save()
            #pdb.set_trace()
 #            print thisRoute['rows'].keys()
-            for schedule in thisRoute['rows'].keys(): 
+            for schedule in thisRoute['rows'].keys(): #loop through each schedule per UniqueRoute and save it
                row = thisRoute['rows'][schedule]
                try:
                    depot = Depot.objects.get(depot_code=row[6])
@ -195,6 +171,8 @@ def importUniqueRoutes():
                #pdb.set_trace()
                routeScheduleObj = RouteSchedule(unique_route=obj, schedule_type=schedule, busesAM=noneInt(row[2]), busesN=noneInt(row[3]), busesPM=noneInt(row[4]), bus_type=row[5], depot_txt=row[6], depot=depot, first_from=formatTime(row[8]), last_from=formatTime(row[9]), first_to=formatTime(row[11]), last_to=formatTime(row[12]), runtime1=noneInt(row[14]), runtime2=noneInt(row[15]), runtime3=noneInt(row[16]), runtime4=noneInt(row[17]), headway1=noneInt(row[18]), headway2=noneInt(row[19]), headway3=noneInt(row[20]), headway4=noneInt(row[21]), headway5=noneInt(row[22]))
                routeScheduleObj.save()
    #done saving things - write out error files:
    errors = open(join(PROJECT_ROOT, "../errors/routeMasterMissingRoutes.json"), "w")
    errors.write(json.dumps(routeDoesNotExistErrors, indent=2))
    errors.close()
@ -227,12 +205,20 @@ def formatTime(s):
    except:
        return datetime.time(0,0)
 '''
 Silly function to deal with invalid strings in the data that need to go in as Integers into the db
 passed a string, it will either return int(string) or None if that fails for any reason
 FIXME: find a more elegant way to do this
 '''
 def noneInt(val):
    '''
    Return int(val), or None when val cannot be converted to an integer.
    Only the conversion failures raised by int() are caught (TypeError for
    unsupported types such as None, ValueError for non-numeric strings or NaN,
    OverflowError for infinite floats), so KeyboardInterrupt and SystemExit
    are no longer swallowed.
    '''
    try:
        return int(val)
    except (TypeError, ValueError, OverflowError):
        return None
 '''
 Passed a route code, it gets stop codes for the first and last stop
 '''
 def getFromToStopsForRoute(routeCode):
 #    fromStr = row[2]
    routeDetails = RouteDetails.objects.filter(rno=routeCode).order_by('stopsr')
@ -243,7 +229,12 @@ def getFromToStopsForRoute(routeCode):
    return (fromStop, toStop,)
-
+'''
 checks whether the row in a set of rows for a route has the largest 'span' value, useful to tell if a row belongs to a primary route
 params:
  data - dict with a span attribute
  arr - array of rows to check if data['span'] is greater than. span is at row[13]
 '''
 def isLargestSpan(data, arr):
    span = data['span']
    for a in arr:
@ -265,12 +256,15 @@ def isLargestSpan(data, arr):
 def isNotUnique(data, arr):
    #Returns the index of the first entry in arr whose 'from' and 'to' equal those of data
    #(the removed line below additionally required 'span' to be equal), or False when no entry matches.
    #NOTE(review): a match on the first entry returns 0, which is falsy just like False;
    #groupUnique tests the result with "if matchedRow:", so a match at index 0 looks like "no match" there - confirm.
    i = 0
    for a in arr:
-        if a['from'] == data['from'] and a['to'] == data['to'] and a['span'] == data['span']:
+        if a['from'] == data['from'] and a['to'] == data['to']:
            return i
        i += 1
    return False
-
+'''
 Create routeMapping.json file to map route aliases to route codes
 TODO: add mappings from hard coded routes
 '''
 def getRouteCodes():
    atlasRawCSV = csv.reader(open(join(PROJECT_ROOT, "../db_csv_files/AtlasRaw.csv"), "r"), delimiter="\t")
    atlasDict = json.loads(open(join(PROJECT_ROOT, "../db_csv_files/Atlas.json")).read())
@ -290,6 +284,28 @@ def getRouteCodes():
    mappingFile.close()
 '''
 Import RouteMaster into db
 '''
 def importRouteMaster():
    CsvFile = csv.reader(open(join(PROJECT_ROOT, "../db_csv_files/RouteMaster.csv"), "r"), delimiter=',')
    test = CsvFile.next()
    stop_errors = []
    print test
    for row in CsvFile:
        if len(row) < 1:
            continue
        from_to = getFromToStopsForRoute(row[0])
        if from_to is None:
            stop_errors.append(row[0])
            continue
        print row[0]
        obj = Route(code=row[0], alias=row[1], from_stop_txt=row[2], to_stop_txt=row[3], from_stop=from_to[0], to_stop=from_to[1], distance=row[4], stages=int(row[5]))
        obj.save()
    errors = open(join(PROJECT_ROOT, "../errors/routeStopErrors.json"), "w")
    errors.write(json.dumps(stop_errors, indent=2))
    errors.close()
 def csvClean1():
    atlasCSV = csv.reader(open(join(PROJECT_ROOT, "../db_csv_files/Atlas.csv"), "r"), delimiter="\t")
--- a/chaloBEST/mumbai/admin.py
+++ b/chaloBEST/mumbai/admin.py
@ -2,6 +2,9 @@ from django.contrib import admin
 from django import forms
 from mumbai.models import *
 class RouteScheduleInline(admin.StackedInline):
    #Stacked inline for RouteSchedule objects; listed in UniqueRouteAdmin.inlines
    #so schedules are edited on the admin page of their UniqueRoute.
    model = RouteSchedule
 class AreaAdmin(admin.ModelAdmin):
    list_display = ("a_code", "areanm")
    list_editable = ("areanm",)
@ -26,6 +29,9 @@ class FareAdmin(admin.ModelAdmin):
        models.TextField: {'widget': forms.TextInput},
    }
 class UniqueRouteAdmin(admin.ModelAdmin):
    #Admin options registered for UniqueRoute (see admin.site.register below);
    #embeds the RouteSchedule inline forms on the UniqueRoute page.
    inlines = [RouteScheduleInline]
 class StopForm(forms.ModelForm):
@ -129,3 +135,5 @@ admin.site.register(Landmark, LandmarkAdmin )
 admin.site.register(Depot,DepotAdmin)
 admin.site.register(Holiday,HolidayAdmin)
 admin.site.register(StopLocation,StopLocationAdmin)
 admin.site.register(UniqueRoute, UniqueRouteAdmin)
--- a/db_csv_files/uniqueRoutes.json
+++ b/db_csv_files/uniqueRoutes.json