Commit 00ef863

thbar and fchabouis authored
Unit import module for GTFS stops, with deletion, plus associated tools (#2851)
* Update MinIO template to work with homebrew-based install (I had troubles with Docker)
* Save first MinIO-based sync of latest ResourceHistory assets, so that I can work at scale locally
* Add helper to stream just the stops
* Extract code to module and fix cache bug
* Save sequential processing (without delete of prior copies)
* Automatically delete gtfs tables when a given DataImport is deleted
* Add quick logs
* Save very WIP
* Save WIP before switching to another branch (some refactoring is needed)
* Add note
* Make get_file_stream mox'able
* Success while implementing one test around GtfsToDB import part
* Make sure to delete previous data_import for same resource_history_id
* Add more tests
* Make tests more reliable (order)
* Add bits of documentation
* Delete previous DataImport for the same resource
* Increase coverage
* Make credo happy
* Mix format
* Remove TODO
* Mix format & simplification
* Mix format
* Make credo happy & mix format
* Refactor code to proper modules & rename accordingly
* Remove now legacy code (replaced by ImportStops)
* Rename for clarity
* Update gtfs_import_stops_test.exs
* Update apps/transport/test/transport/jobs/gtfs_import_stops_test.exs (Co-authored-by: Francis Chabouis <[email protected]>)
* Improve minio template
* Update gtfs_import_stops.ex
* DRY code (careful - code isn't under test)
* Update gtfs_import_stops.ex
* Avoid join
* Format code

Co-authored-by: Francis Chabouis <[email protected]>
1 parent 7bfbb6e commit 00ef863

File tree

9 files changed: +233 -10 lines changed


.miniorc.template

+9-5
@@ -4,14 +4,18 @@
 export MINIO_ROOT_USER=test-local
 export MINIO_ROOT_PASSWORD=apoi8761876bbazeriouy
 
-# 3. Follow instructions at
+# 3. It is preferable to use a non-Docker version when available,
+# such as https://min.io/docs/minio/macos/index.html, in which case you can do:
+# minio server --console-address :9090 ~/data
+
+# 4. For Docker support (a bit more involved), follow instructions at
 # https://docs.min.io/minio/baremetal/quickstart/container.html#quickstart-container
 # which means at time of writing:
 # mkdir -p ~/minio/data
 # NOTE: -name removed for simplicity, and "quayio" removed since the container appeared outdated
 # docker run -p 9000:9000 -p 9090:9090 -v ~/minio/data:/data -e "MINIO_ROOT_USER=$MINIO_ROOT_USER" -e "MINIO_ROOT_PASSWORD=$MINIO_ROOT_PASSWORD" minio/minio server /data --console-address ":9090"
 
-# 4. setup `dev.secret.exs` from `dev.secret.template.exs`
-# 5. source `.miniorc`
-# 6. go to console at http://127.0.0.1:9090 and create bucket `transport-data-gouv-fr-resource-history-dev`
-# 7. start `mix phx.server`
+# 5. setup `dev.secret.exs` from `dev.secret.template.exs`
+# 6. source `.miniorc`
+# 7. go to console at http://127.0.0.1:9090 and create bucket `transport-data-gouv-fr-resource-history-dev`
+# 8. start `mix phx.server`
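
After following these steps, a quick sanity check from `iex -S mix` is to print the bucket name the application will look up; a minimal sketch, assuming the dev `s3_buckets` config mirrors the bucket created in step 7:

# Hedged check: :history is the bucket key used by the resource history code;
# the expected value is inferred from the bucket name mentioned in step 7 above.
Transport.S3.bucket_name(:history)
#=> "transport-data-gouv-fr-resource-history-dev"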

apps/transport/lib/S3/unzip.ex

+13-4
@@ -1,4 +1,16 @@
+defmodule Transport.Unzip.S3.Behaviour do
+  @moduledoc """
+  Behaviour to allow partial Unzip testing.
+  """
+
+  @callback get_file_stream(binary(), binary(), binary()) :: Enumerable.t()
+end
+
 defmodule Transport.Unzip.S3 do
+  def impl, do: Application.get_env(:transport, :unzip_s3_impl, __MODULE__)
+
+  @behaviour Transport.Unzip.S3.Behaviour
+
   @moduledoc """
   Read a remote zip file stored on a S3 bucket, as explained here
   https://hexdocs.pm/unzip/readme.html
@@ -19,10 +31,7 @@ defmodule Transport.Unzip.S3 do
   )
 
   def get_file_stream(file_name, zip_name, bucket_name) do
-    aws_s3_config = aws_s3_config()
-    file = new(zip_name, bucket_name, aws_s3_config)
-    {:ok, unzip} = Unzip.new(file)
-
+    {:ok, unzip} = get_unzip(zip_name, bucket_name)
     Unzip.file_stream!(unzip, file_name)
   end

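This is the usual behaviour-plus-impl() indirection used with Mox: callers go through `impl()`, and the test environment swaps in a mock via configuration. A minimal sketch of the two paths (the file, zip and bucket arguments below are placeholders):

# dev/prod: :unzip_s3_impl is not configured, so impl() falls back to __MODULE__
# and the real S3-backed streaming is used
Transport.Unzip.S3.impl().get_file_stream("stops.txt", "some-file.zip", "some-bucket")

# test: config/test.exs (further down) sets unzip_s3_impl: Transport.Unzip.S3.Mock,
# so the same call dispatches to whatever Mox expectation the test has declared
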
apps/transport/lib/jobs/gtfs_to_db.ex

+7-1
@@ -13,10 +13,16 @@ defmodule Transport.Jobs.GtfsToDB do
     fill_trips_from_resource_history(resource_history_id, data_import_id)
   end
 
+  def import_gtfs_from_resource_history(resource_history_id, :stops) do
+    %{id: data_import_id} = %DB.DataImport{resource_history_id: resource_history_id} |> DB.Repo.insert!()
+    fill_stops_from_resource_history(resource_history_id, data_import_id)
+    data_import_id
+  end
+
   def file_stream(resource_history_id, gtfs_file_name) do
     %{payload: %{"filename" => filename}} = DB.ResourceHistory |> DB.Repo.get!(resource_history_id)
     bucket_name = Transport.S3.bucket_name(:history)
-    Transport.Unzip.S3.get_file_stream(gtfs_file_name, filename, bucket_name)
+    Transport.Unzip.S3.impl().get_file_stream(gtfs_file_name, filename, bucket_name)
   end
 
   def fill_stops_from_resource_history(resource_history_id, data_import_id) do
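
The new `:stops` clause wraps the existing stops import into a dedicated `DB.DataImport` and returns its id, which the import module below builds on. A short usage sketch (the `resource_history_id` is a placeholder):

# Creates a fresh DataImport tied to the given ResourceHistory, fills gtfs_stops
# from the zipped GTFS, and returns the new DataImport id.
data_import_id = Transport.Jobs.GtfsToDB.import_gtfs_from_resource_history(resource_history_id, :stops)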
gtfs_import_stops.ex (new file)

+41

defmodule Transport.GTFSImportStops do
  @moduledoc """
  A module to import stops in a single `DB.DataImport` for a given resource, based on `resource_history_id`.
  """

  import Ecto.Query

  @doc """
  For the given `resource_history_id`, imports stops in a new `DB.DataImport`, then delete all related
  pre-existing `DB.DataImport` (either with the same `resource_history_id`, or for the same resource).
  """
  def import_stops_and_remove_previous(resource_history_id) do
    # Transaction timeout is at 15s currently, we may need to customize this here later
    {:ok, data_import_id} =
      DB.Repo.transaction(fn ->
        data_import_id = Transport.Jobs.GtfsToDB.import_gtfs_from_resource_history(resource_history_id, :stops)

        resource_id = DB.Repo.get_by(DB.ResourceHistory, id: resource_history_id).resource_id

        query =
          from(rh in DB.ResourceHistory,
            where: rh.resource_id == ^resource_id and rh.id != ^resource_history_id,
            select: rh.id
          )

        resource_history_ids = query |> DB.Repo.all()

        # NOTE: we may need to add an index on di.resource_history_id
        DB.DataImport
        # delete all previous data imports for the same resource history id
        |> where([di], di.resource_history_id == ^resource_history_id and di.id != ^data_import_id)
        # delete all previous data imports for the same resource but different resource history ids
        |> or_where([di], di.resource_history_id in ^resource_history_ids)
        |> DB.Repo.delete_all()

        data_import_id
      end)

    data_import_id
  end
end
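
The net effect is that at most one `DB.DataImport` remains per resource after each call. A hedged usage sketch (ids and variables are placeholders):

# First import for a given ResourceHistory: creates DataImport A.
a = Transport.GTFSImportStops.import_stops_and_remove_previous(resource_history_id)

# Importing again (same ResourceHistory, or a newer one for the same resource)
# creates DataImport B and deletes A inside the same transaction, so readers
# never see two competing imports for one resource.
b = Transport.GTFSImportStops.import_stops_and_remove_previous(resource_history_id)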
DB.Repo.Migrations.DataImportDelete (new migration)

+24

defmodule DB.Repo.Migrations.DataImportDelete do
  use Ecto.Migration

  @gtfs_tables [
    "gtfs_stops",
    "gtfs_stop_times",
    "gtfs_trips",
    "gtfs_calendar",
    "gtfs_calendar_dates"
  ]

  def up do
    @gtfs_tables
    |> Enum.each fn(tbl) ->
      constraint_name = "#{tbl}_data_import_id_fkey" |> String.to_atom()
      drop constraint(tbl, constraint_name)
      alter table(tbl) do
        modify :data_import_id,
          references(:data_import, on_delete: :delete_all,
            from: references(:data_import, on_delete: :nothing))
      end
    end
  end
end
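
With the foreign keys switched to `on_delete: :delete_all`, deleting a `DB.DataImport` (as `import_stops_and_remove_previous/1` now does for previous imports) also removes its rows from the listed gtfs_* tables at the database level. A minimal sketch of the effect, using a placeholder id:

# Cascades to the gtfs_stops, gtfs_stop_times, gtfs_trips, gtfs_calendar and
# gtfs_calendar_dates rows carrying this data_import_id.
DB.DataImport |> DB.Repo.get!(data_import_id) |> DB.Repo.delete!()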

apps/transport/test/support/mocks.ex

+1
@@ -11,3 +11,4 @@ Mox.defmock(Transport.EmailSender.Mock, for: Transport.EmailSender)
 Mox.defmock(Hasher.Mock, for: Hasher.Wrapper)
 Mox.defmock(Transport.ValidatorsSelection.Mock, for: Transport.ValidatorsSelection)
 Mox.defmock(Transport.SIRIQueryGenerator.Mock, for: Transport.SIRIQueryGenerator.Behaviour)
+Mox.defmock(Transport.Unzip.S3.Mock, for: Transport.Unzip.S3.Behaviour)
apps/transport/test/transport/jobs/gtfs_import_stops_test.exs (new file)

+79

defmodule Transport.Test.Transport.Jobs.GTFSImportStopsTest do
  use ExUnit.Case, async: true
  use Oban.Testing, repo: DB.Repo
  import DB.Factory
  import Mox
  import Ecto.Query

  setup :verify_on_exit!

  setup do
    :ok = Ecto.Adapters.SQL.Sandbox.checkout(DB.Repo)
  end

  def data_import_ids do
    DB.Repo.all(from(di in DB.DataImport, select: di.id, order_by: [asc: di.id]))
  end

  def setup_mox(zip_filename) do
    # NOTE: it will be possible to reuse common code from Transport.Unzip.S3 in there
    Transport.Unzip.S3.Mock
    |> expect(:get_file_stream, fn file_in_zip, zip_file, bucket ->
      # from payload
      assert zip_file == zip_filename
      # from config
      assert bucket == "transport-data-gouv-fr-resource-history-test"

      # stub with a local file
      path = "#{__DIR__}/../../fixture/files/gtfs_import.zip"
      zip_file = Unzip.LocalFile.open(path)
      {:ok, unzip} = Unzip.new(zip_file)
      Unzip.file_stream!(unzip, file_in_zip)
    end)
  end

  test "import stops" do
    %{id: dataset_id} = insert(:dataset, %{datagouv_id: "xxx", datagouv_title: "coucou"})
    %{id: resource_id} = insert(:resource, dataset_id: dataset_id)

    %{id: resource_history_id} =
      insert(:resource_history, %{resource_id: resource_id, payload: %{"filename" => "some-file.zip"}})

    setup_mox("some-file.zip")
    assert data_import_ids() == []
    first_data_import_id = Transport.GTFSImportStops.import_stops_and_remove_previous(resource_history_id)
    assert data_import_ids() == [first_data_import_id]

    # subsequent import must remove the previous import for same resource_history_id
    setup_mox("some-file.zip")
    second_data_import_id = Transport.GTFSImportStops.import_stops_and_remove_previous(resource_history_id)
    assert data_import_ids() == [second_data_import_id]

    # subsequent import for a new resource_history_id on same resource should also remove previous imports
    %{id: new_resource_history_id} =
      insert(:resource_history, %{resource_id: resource_id, payload: %{"filename" => "some-new-file.zip"}})

    setup_mox("some-new-file.zip")
    third_data_import_id = Transport.GTFSImportStops.import_stops_and_remove_previous(new_resource_history_id)
    assert data_import_ids() == [third_data_import_id]

    # other resources should not be impacted by import
    setup_mox("some-other-file.zip")
    %{id: other_dataset_id} = insert(:dataset, %{datagouv_id: "yyy"})
    %{id: other_resource_id} = insert(:resource, dataset_id: other_dataset_id)

    %{id: other_resource_history_id} =
      insert(:resource_history, %{resource_id: other_resource_id, payload: %{"filename" => "some-other-file.zip"}})

    other_data_import_id = Transport.GTFSImportStops.import_stops_and_remove_previous(other_resource_history_id)

    assert data_import_ids() == [third_data_import_id, other_data_import_id]

    %{id: new_resource_history_id} =
      insert(:resource_history, %{resource_id: resource_id, payload: %{"filename" => "some-new-file.zip"}})

    setup_mox("some-new-file.zip")
    fourth_data_import_id = Transport.GTFSImportStops.import_stops_and_remove_previous(new_resource_history_id)
    assert data_import_ids() == [other_data_import_id, fourth_data_import_id]
  end
end

config/test.exs

+1
@@ -41,6 +41,7 @@ config :transport,
   validator_selection: Transport.ValidatorsSelection.Mock,
   data_visualization: Transport.DataVisualization.Mock,
   notifications_api_token: "secret",
+  unzip_s3_impl: Transport.Unzip.S3.Mock,
   s3_buckets: %{
     history: "resource-history-test",
     on_demand_validation: "on-demand-validation-test",
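
Only the test environment defines `:unzip_s3_impl`; in dev and prod the key is absent, so `Transport.Unzip.S3.impl/0` returns the real module. A small sketch of the lookup behind `impl/0`:

# dev/prod: key absent, the default (__MODULE__, i.e. Transport.Unzip.S3) wins
Application.get_env(:transport, :unzip_s3_impl, Transport.Unzip.S3)
#=> Transport.Unzip.S3

# test: with the line added above, the same lookup returns Transport.Unzip.S3.Mock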
MinIO local sync script (new file)

+58

# NOTE: temporary script to help work locally with large amounts of GTFS.
require Logger

Logger.info("Starting...")

import Ecto.Query

# Useful notes to create the global import job later:
# - we could look at "last up to date" to ensure we only have fresh data in database
# - use gtfs_to_db once downloaded
# - look at Transport.Jobs.BNLCToGeoData and Transport.Jobs.BaseGeoData.import_replace_data

defmodule Tooling do
  # for each active dataset, grab all resources with a resource history.
  def query() do
    DB.Dataset.base_query()
    |> DB.Resource.join_dataset_with_resource()
    |> DB.ResourceHistory.join_resource_with_latest_resource_history()
    |> where([resource: r], r.format == "GTFS")
    |> select([resource_history: rh], rh)
  end
end

defmodule SyncS3LatestResourceHistory do
  def sync!(minio_folder) do
    Tooling.query()
    |> DB.Repo.all()
    # TODO: task sync max concurrency (this is slow)
    |> Enum.each(fn rh ->
      # create local minio bucket, if needed
      bucket_name = Transport.S3.bucket_name(:history)
      base_path = Path.join(minio_folder, bucket_name)
      unless File.exists?(base_path), do: File.mkdir_p!(base_path)

      # simple check based on file presence on disk
      file_path = Path.join(base_path, rh.payload["filename"])

      # TODO: replace by head request
      if File.exists?(file_path) do
        Logger.info("File already downloaded, skipping...")
      else
        Logger.info("Downloading file...")
        Logger.info(file_path)

        %HTTPoison.Response{status_code: 200, body: body} =
          Transport.Shared.Wrapper.HTTPoison.impl().get!(rh.payload["permanent_url"])

        Transport.S3.upload_to_s3!(:history, body, rh.payload["filename"])
      end
    end)
  end
end

Logger.configure(level: :info)

# create a local S3 copy (via MinIO) of each latest ResourceHistory file so
# that the production database dump can be used locally with a matching local S3 file
SyncS3LatestResourceHistory.sync!(Path.expand("~/data"))
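
Before running the full sync, the query alone can be inspected to see which files would be mirrored locally; a hedged iex sketch, assuming the script (and therefore `Tooling`) has been loaded:

# List the filenames of the latest GTFS ResourceHistory rows that sync!/1 would
# download, without downloading anything.
Tooling.query() |> DB.Repo.all() |> Enum.map(& &1.payload["filename"])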
