From 7ed00e598fdf653387056af310402884e5120bc0 Mon Sep 17 00:00:00 2001
From: ttwong
Date: Fri, 20 Sep 2024 09:22:03 +0800
Subject: [PATCH 01/11] Add initial bus stop grouping feature

---
 crawling/groupBus.py | 93 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 crawling/groupBus.py

diff --git a/crawling/groupBus.py b/crawling/groupBus.py
new file mode 100644
index 00000000..41560171
--- /dev/null
+++ b/crawling/groupBus.py
@@ -0,0 +1,93 @@
+from numpy import float64
+from scipy.spatial import KDTree
+import json
+import math
+import polars as pl
+
+def haversine_distance(lat1, lon1, lat2, lon2):
+    R = 6371000 # Earth radius in meters
+
+    phi1 = math.radians(lat1)
+    phi2 = math.radians(lat2)
+    delta_phi = math.radians(lat2 - lat1)
+    delta_lambda = math.radians(lon2 - lon1)
+
+    a = math.sin(delta_phi/2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda/2)**2
+    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
+
+    return R * c
+
+def calculate_bearing(lat1, lon1, lat2, lon2):
+    phi1 = math.radians(lat1)
+    phi2 = math.radians(lat2)
+    delta_lambda = math.radians(lon2 - lon1)
+
+    y = math.sin(delta_lambda) * math.cos(phi2)
+    x = math.cos(phi1) * math.sin(phi2) - math.sin(phi1) * math.cos(phi2) * math.cos(delta_lambda)
+    theta = math.atan2(y, x)
+
+    return (math.degrees(theta) + 360) % 360
+
+def group_bus_stops(bus_stops, max_distance=50, bearing_threshold=35):
+    tree = KDTree(bus_stops.select("lat", "lng"))
+    groups = pl.DataFrame(schema={"id":str, "lat":pl.Float64(), "lng":pl.Float64(), "name_en":str, "name_zh":str, "bus_group_id":pl.Int32()})
+    visited = set()
+    group_id = 1
+
+    print(len(bus_stops))
+
+    for i in range(len(bus_stops)):
+        if i in visited:
+            continue
+
+        # Create a new group for this stop
+        stop1 = bus_stops[i]
+        stop1 = stop1.with_columns(
+            bus_group_id=group_id
+        )
+        group = stop1
+        nearby_stop_indices = tree.query_ball_point([stop1['lat'][0], stop1['lng'][0]], r=max_distance/1000)
+
+        for j in nearby_stop_indices:
+            if i != j and j not in visited:
+                stop2 = bus_stops[j]
+
+                distance = haversine_distance(stop1['lat'][0], stop1['lng'][0], stop2['lat'][0], stop2['lng'][0])
+
+                if distance <= max_distance:
+                    if group.height > 1:
+                        prev_stop = group[-2]
+                        bearing1 = calculate_bearing(prev_stop['lat'][0], prev_stop['lng'][0], stop1['lat'][0], stop1['lng'][0])
+                        bearing2 = calculate_bearing(stop1['lat'][0], stop1['lng'][0], stop2['lat'][0], stop2['lng'][0])
+
+                        if abs(bearing1 - bearing2) <= bearing_threshold or abs(bearing1 - bearing2) >= 360 - bearing_threshold:
+                            stop2 = stop2.with_columns(
+                                bus_group_id=group_id
+                            )
+                            group = group.vstack(stop2)
+                    else:
+                        stop2 = stop2.with_columns(
+                            bus_group_id=group_id
+                        )
+                        group = group.vstack(stop2)
+
+        group_id += 1
+        visited.add(i)
+        groups = groups.vstack(group)
+
+    return groups
+
+if __name__ == '__main__':
+    with open("routeFareList.min.json", 'r', encoding='utf8') as f:
+        r = json.load(f)
+        r = r['stopList']
+
+    j2 = [{"id": id, "lat": v['location']['lat'], "lng": v['location']['lng'],
+           "name_en": v['name']['en'], "name_zh": v['name']['zh']} for id, v in r.items()]
+
+    df = pl.from_dicts(j2) #.lazy()
+    #df = df.filter(pl.col('name_zh').str.contains('宋皇'))
+    grouped_bus_stops = group_bus_stops(df)
+
+    with open(f'groupBus_all.json', 'w', encoding='utf8') as f:
+        f.write(grouped_bus_stops.write_json())
\ No newline at end of file
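
A quick way to sanity-check the two geometry helpers introduced by this patch is to call them on a pair of points a known distance apart. This is a standalone sketch, not part of the series; it assumes crawling/ is the working directory so groupBus.py is importable, and the coordinates are made up.

    # Standalone sanity check for haversine_distance / calculate_bearing.
    # Assumes crawling/ is the working directory so groupBus.py is importable.
    from groupBus import haversine_distance, calculate_bearing

    # Two made-up points roughly 30 m apart on a north-south line.
    lat1, lng1 = 22.30200, 114.17700
    lat2, lng2 = 22.30227, 114.17700

    print(round(haversine_distance(lat1, lng1, lat2, lng2)))  # ~30 (metres)
    print(round(calculate_bearing(lat1, lng1, lat2, lng2)))   # ~0 (due north)
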
From c0837f91552439144b22a3e56385d7edc74b0e60 Mon Sep 17 00:00:00 2001
From: ttwong
Date: Fri, 20 Sep 2024 09:22:03 +0800
Subject: [PATCH 02/11] Add initial bus stop grouping feature

---
 crawling/groupBus.py      | 93 +++++++++++++++++++++++++++++++++++++++
 crawling/requirements.txt |  2 +
 2 files changed, 95 insertions(+)
 create mode 100644 crawling/groupBus.py

diff --git a/crawling/groupBus.py b/crawling/groupBus.py
new file mode 100644
index 00000000..41560171
--- /dev/null
+++ b/crawling/groupBus.py
@@ -0,0 +1,93 @@
+from numpy import float64
+from scipy.spatial import KDTree
+import json
+import math
+import polars as pl
+
+def haversine_distance(lat1, lon1, lat2, lon2):
+    R = 6371000 # Earth radius in meters
+
+    phi1 = math.radians(lat1)
+    phi2 = math.radians(lat2)
+    delta_phi = math.radians(lat2 - lat1)
+    delta_lambda = math.radians(lon2 - lon1)
+
+    a = math.sin(delta_phi/2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda/2)**2
+    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
+
+    return R * c
+
+def calculate_bearing(lat1, lon1, lat2, lon2):
+    phi1 = math.radians(lat1)
+    phi2 = math.radians(lat2)
+    delta_lambda = math.radians(lon2 - lon1)
+
+    y = math.sin(delta_lambda) * math.cos(phi2)
+    x = math.cos(phi1) * math.sin(phi2) - math.sin(phi1) * math.cos(phi2) * math.cos(delta_lambda)
+    theta = math.atan2(y, x)
+
+    return (math.degrees(theta) + 360) % 360
+
+def group_bus_stops(bus_stops, max_distance=50, bearing_threshold=35):
+    tree = KDTree(bus_stops.select("lat", "lng"))
+    groups = pl.DataFrame(schema={"id":str, "lat":pl.Float64(), "lng":pl.Float64(), "name_en":str, "name_zh":str, "bus_group_id":pl.Int32()})
+    visited = set()
+    group_id = 1
+
+    print(len(bus_stops))
+
+    for i in range(len(bus_stops)):
+        if i in visited:
+            continue
+
+        # Create a new group for this stop
+        stop1 = bus_stops[i]
+        stop1 = stop1.with_columns(
+            bus_group_id=group_id
+        )
+        group = stop1
+        nearby_stop_indices = tree.query_ball_point([stop1['lat'][0], stop1['lng'][0]], r=max_distance/1000)
+
+        for j in nearby_stop_indices:
+            if i != j and j not in visited:
+                stop2 = bus_stops[j]
+
+                distance = haversine_distance(stop1['lat'][0], stop1['lng'][0], stop2['lat'][0], stop2['lng'][0])
+
+                if distance <= max_distance:
+                    if group.height > 1:
+                        prev_stop = group[-2]
+                        bearing1 = calculate_bearing(prev_stop['lat'][0], prev_stop['lng'][0], stop1['lat'][0], stop1['lng'][0])
+                        bearing2 = calculate_bearing(stop1['lat'][0], stop1['lng'][0], stop2['lat'][0], stop2['lng'][0])
+
+                        if abs(bearing1 - bearing2) <= bearing_threshold or abs(bearing1 - bearing2) >= 360 - bearing_threshold:
+                            stop2 = stop2.with_columns(
+                                bus_group_id=group_id
+                            )
+                            group = group.vstack(stop2)
+                    else:
+                        stop2 = stop2.with_columns(
+                            bus_group_id=group_id
+                        )
+                        group = group.vstack(stop2)
+
+        group_id += 1
+        visited.add(i)
+        groups = groups.vstack(group)
+
+    return groups
+
+if __name__ == '__main__':
+    with open("routeFareList.min.json", 'r', encoding='utf8') as f:
+        r = json.load(f)
+        r = r['stopList']
+
+    j2 = [{"id": id, "lat": v['location']['lat'], "lng": v['location']['lng'],
+           "name_en": v['name']['en'], "name_zh": v['name']['zh']} for id, v in r.items()]
+
+    df = pl.from_dicts(j2) #.lazy()
+    #df = df.filter(pl.col('name_zh').str.contains('宋皇'))
+    grouped_bus_stops = group_bus_stops(df)
+
+    with open(f'groupBus_all.json', 'w', encoding='utf8') as f:
+        f.write(grouped_bus_stops.write_json())
\ No newline at end of file
diff --git a/crawling/requirements.txt b/crawling/requirements.txt
index a76cdd79..ca6deb05 100644
--- a/crawling/requirements.txt
+++ b/crawling/requirements.txt
@@ -15,4 +15,6 @@ wheel==0.36.2
 pyproj==3.3.0
 httpx==0.25.2
 xxhash==3.2.0
+polars==1.7.1
+scipy==1.24.0
 -e .
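
With the script and its dependencies in place, the behaviour of group_bus_stops can be seen by running it on a tiny hand-made DataFrame instead of the full stop list. Again a sketch outside the patch series: the stop ids, names and coordinates below are invented, and only the column names and types mirror the schema used above.

    # Illustrative run of group_bus_stops on three invented stops.
    import polars as pl
    from groupBus import group_bus_stops

    stops = pl.DataFrame({
        "id":      ["A1", "A2", "B1"],
        "lat":     [22.30200, 22.30220, 22.33500],    # A1 and A2 are ~22 m apart
        "lng":     [114.17700, 114.17700, 114.20000], # B1 is kilometres away
        "name_en": ["Stop A", "Stop A (opp)", "Stop B"],
        "name_zh": ["A站", "A站對面", "B站"],
    })

    grouped = group_bus_stops(stops)  # defaults: max_distance=50 m, bearing_threshold=35
    print(grouped.select("id", "bus_group_id"))
    # Expected with the algorithm above: A1 and A2 share group 1, B1 gets a
    # group of its own, and A2 also seeds a singleton group, because only the
    # seed index i (never the neighbour index j) is added to `visited`.
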
From 65812159a05768db4e15ee10f0fd31fed2c2d470 Mon Sep 17 00:00:00 2001
From: Raymond Tau
Date: Sun, 22 Sep 2024 00:24:27 +0800
Subject: [PATCH 03/11] feat: upgrade GHA Python to 3.12

As the new libraries used require Python 3.9 or later.
---
 .github/workflows/fetch-data.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/fetch-data.yml b/.github/workflows/fetch-data.yml
index 7c0f19d2..02bb0c49 100644
--- a/.github/workflows/fetch-data.yml
+++ b/.github/workflows/fetch-data.yml
@@ -15,7 +15,7 @@ jobs:
       - name: Setup Python environment
         uses: actions/setup-python@v4
         with:
-          python-version: '3.8'
+          python-version: '3.12'
           architecture: 'x64'
           cache: 'pip'
           cache-dependency-path: crawling/requirements.txt

From 9f1945f35800dddd4cef7e1c63b6f8ea2d20f4ea Mon Sep 17 00:00:00 2001
From: Raymond Tau
Date: Sun, 22 Sep 2024 00:34:57 +0800
Subject: [PATCH 04/11] fix: update dependency for newer Python version

---
 crawling/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawling/requirements.txt b/crawling/requirements.txt
index ca6deb05..aa5cb360 100644
--- a/crawling/requirements.txt
+++ b/crawling/requirements.txt
@@ -12,7 +12,7 @@ PySocks==1.7.1
 six==1.15.0
 urllib3==1.26.4
 wheel==0.36.2
-pyproj==3.3.0
+pyproj==3.6.1
 httpx==0.25.2
 xxhash==3.2.0
 polars==1.7.1

From 96cfe60c43b0b91445146e287a82a69d5ba8b99b Mon Sep 17 00:00:00 2001
From: Raymond Tau
Date: Sun, 22 Sep 2024 00:35:12 +0800
Subject: [PATCH 05/11] fix: change library version to latest available

---
 crawling/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawling/requirements.txt b/crawling/requirements.txt
index aa5cb360..49d263c2 100644
--- a/crawling/requirements.txt
+++ b/crawling/requirements.txt
@@ -16,5 +16,5 @@ pyproj==3.6.1
 httpx==0.25.2
 xxhash==3.2.0
 polars==1.7.1
-scipy==1.24.0
+scipy==1.14.1
 -e .
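
As a throwaway check (not part of any patch) that a local environment matches what the workflow now expects, Python 3.9 or later per the commit message above plus the versions pinned in requirements.txt:

    # Throwaway environment check; the pins simply mirror crawling/requirements.txt.
    import sys
    from importlib.metadata import version

    assert sys.version_info >= (3, 9), "the new libraries require Python 3.9 or later"
    for pkg, pinned in [("polars", "1.7.1"), ("scipy", "1.14.1"), ("pyproj", "3.6.1")]:
        print(f"{pkg}: installed {version(pkg)}, pinned {pinned}")
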
From 037aaa37bf87ce135841725e31d2f6b950f42df4 Mon Sep 17 00:00:00 2001
From: Raymond Tau
Date: Sun, 22 Sep 2024 00:46:18 +0800
Subject: [PATCH 06/11] feat: include groupBus.py in crawling

---
 .github/workflows/fetch-data.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/fetch-data.yml b/.github/workflows/fetch-data.yml
index 02bb0c49..bddb1396 100644
--- a/.github/workflows/fetch-data.yml
+++ b/.github/workflows/fetch-data.yml
@@ -45,6 +45,7 @@ jobs:
           python ./crawling/matchGtfs.py
           python ./crawling/cleansing.py
           python ./crawling/mergeRoutes.py
+          python ./crawling/groupBus.py
           python ./crawling/routeCompare.py
           python ./crawling/mtrExits.py

From 5ba0acaac8e68ace0a94480f6e0a9ba7ca32b50f Mon Sep 17 00:00:00 2001
From: Raymond Tau
Date: Sun, 22 Sep 2024 00:47:43 +0800
Subject: [PATCH 07/11] fix: remove duplicate copy in deployment

---
 .github/workflows/fetch-data.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/fetch-data.yml b/.github/workflows/fetch-data.yml
index bddb1396..82d3d12e 100644
--- a/.github/workflows/fetch-data.yml
+++ b/.github/workflows/fetch-data.yml
@@ -68,12 +68,13 @@ jobs:
             ROUTE_BUS.xml
             route-ts/
             exits.mtr.json
+            groupBus_all.json
       - name: Update MD5
         run: md5sum routeFareList.min.json | cut -f1 -d ' ' | tr -d $'\n' > routeFareList.md5
       - name: create deployment folder
         run: mkdir -p build
       - name: cp files into deployment folder
-        run: cp -r routeFareList.json routeFareList.min.json routeFareList.md5 CNAME exits.mtr.json route-ts build/
+        run: cp -r routeFareList.json routeFareList.min.json routeFareList.md5 CNAME exits.mtr.json build/
       - name: cp route-ts into deployment folder
         run: cp -r route-ts build
       - name: Update resources

From da70905b16c7d77498ba716be13f506b74875524 Mon Sep 17 00:00:00 2001
From: Raymond Tau
Date: Sun, 22 Sep 2024 00:49:03 +0800
Subject: [PATCH 08/11] chore: format cp command for easier reading

---
 .github/workflows/fetch-data.yml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/fetch-data.yml b/.github/workflows/fetch-data.yml
index 82d3d12e..464ee76b 100644
--- a/.github/workflows/fetch-data.yml
+++ b/.github/workflows/fetch-data.yml
@@ -74,7 +74,14 @@ jobs:
       - name: create deployment folder
         run: mkdir -p build
       - name: cp files into deployment folder
-        run: cp -r routeFareList.json routeFareList.min.json routeFareList.md5 CNAME exits.mtr.json build/
+        run: |
+          cp -r \
+            routeFareList.json \
+            routeFareList.min.json \
+            routeFareList.md5 \
+            CNAME \
+            exits.mtr.json \
+            build/
       - name: cp route-ts into deployment folder
         run: cp -r route-ts build
       - name: Update resources

From d2f6fe4dfe663cfa734512b761a0017f766c72a8 Mon Sep 17 00:00:00 2001
From: Raymond Tau
Date: Sun, 22 Sep 2024 00:49:24 +0800
Subject: [PATCH 09/11] feat: include groupBus json in deployment

---
 .github/workflows/fetch-data.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/fetch-data.yml b/.github/workflows/fetch-data.yml
index 464ee76b..963aa012 100644
--- a/.github/workflows/fetch-data.yml
+++ b/.github/workflows/fetch-data.yml
@@ -81,6 +81,7 @@ jobs:
             routeFareList.md5 \
             CNAME \
             exits.mtr.json \
+            groupBus_all.json \
             build/
       - name: cp route-ts into deployment folder
         run: cp -r route-ts build

From 365dd907b5502b6e33e68a04e63afa6621343386 Mon Sep 17 00:00:00 2001
From: Raymond Tau
Date: Sun, 22 Sep 2024 18:14:41 +0800
Subject: [PATCH 10/11] fix: correct regex escape to mute SyntaxWarning in Python 3.12

---
 crawling/parseGtfs.py   | 2 +-
 crawling/parseGtfsEn.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/crawling/parseGtfs.py b/crawling/parseGtfs.py
index b1a95384..f51ef312 100644
--- a/crawling/parseGtfs.py
+++ b/crawling/parseGtfs.py
@@ -97,7 +97,7 @@ async def parseGtfs():
     _tmp.sort(key=takeFirst)
     routeList[route_id]['fares'][bound] = [v[0] for k,v in _tmp]

-  nameReg = re.compile('\[(.*)\] (.*)')
+  nameReg = re.compile(r'\[(.*)\] (.*)')
   def parseStopName(name):
     ret = {}
     for str in name.split('|'):
diff --git a/crawling/parseGtfsEn.py b/crawling/parseGtfsEn.py
index 42436557..c67d5ce3 100644
--- a/crawling/parseGtfsEn.py
+++ b/crawling/parseGtfsEn.py
@@ -97,7 +97,7 @@ async def parseGtfs():
     _tmp.sort(key=takeFirst)
     routeList[route_id]['fares'][bound] = [v[0] for k,v in _tmp]

-  nameReg = re.compile('\[(.*)\] (.*)')
+  nameReg = re.compile(r'\[(.*)\] (.*)')
   def parseStopName(name):
     ret = {}
     for str in name.split('|'):
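
The warning this patch silences is easy to reproduce in isolation: on Python 3.12 the sequence '\[' inside a plain string literal is an invalid escape and compiling it emits a SyntaxWarning, while the raw-string form builds the identical pattern silently. A standalone sketch with a made-up stop name:

    import re

    # Old form: '\[' is an invalid escape sequence, so Python 3.12 emits
    # "SyntaxWarning: invalid escape sequence '\['" when this line is compiled.
    nameReg_old = re.compile('\[(.*)\] (.*)')

    # New form: the raw string produces the identical pattern with no warning.
    nameReg_new = re.compile(r'\[(.*)\] (.*)')

    # Made-up "[operator] name" style input, just to show the two captured groups.
    print(nameReg_new.match('[KMB] Some Stop Name').groups())  # ('KMB', 'Some Stop Name')

Both compiled patterns behave the same; the change only removes the warning, which is slated to become an error in a future Python release.
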
From 3b39749bbf49da7106f84bf76ef4658fdc195c35 Mon Sep 17 00:00:00 2001
From: Raymond Tau
Date: Sun, 22 Sep 2024 18:29:07 +0800
Subject: [PATCH 11/11] fix: grant contents:write permission to GHA job

So it can push changes to the gh-pages branch, even if the repo settings
disallow writes by default.
---
 .github/workflows/fetch-data.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/fetch-data.yml b/.github/workflows/fetch-data.yml
index 963aa012..fc50a490 100644
--- a/.github/workflows/fetch-data.yml
+++ b/.github/workflows/fetch-data.yml
@@ -7,6 +7,8 @@ on:

 jobs:
   Fetch-Route-Data:
+    permissions:
+      contents: write
     runs-on: ubuntu-latest
     steps: