Skip to content

Commit b9b2fa9

Browse files
Crée et importe des réutilisations de data.gouv.fr (#4478)
1 parent d6ef89e commit b9b2fa9

File tree

7 files changed

+317
-0
lines changed

7 files changed

+317
-0
lines changed

apps/transport/lib/db/reuse.ex

+125
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
defmodule DB.Reuse do
  @moduledoc """
  Represents data.gouv.fr reuses.

  Records are built from string-keyed attributes, such as the rows of the
  data.gouv.fr reuses CSV export decoded by `Transport.Jobs.ImportReusesJob`.
  In that payload booleans, integers and lists are serialized as strings
  (`"False"`, `"234"`, `"foo,bar"`) and the remote identifier is found under
  `"id"`, so `changeset/2` converts these values before validating them.
  """
  use TypedEctoSchema
  use Ecto.Schema
  import Ecto.Changeset
  import Ecto.Query

  # Attributes copied as-is from the payload by `cast/3`.
  @cast_fields [
    :title,
    :slug,
    :url,
    :type,
    :description,
    :remote_url,
    :organization,
    :organization_id,
    :owner,
    :owner_id,
    :image,
    :topic,
    :created_at,
    :last_modified
  ]

  @required_fields [
    :datagouv_id,
    :title,
    :slug,
    :url,
    :type,
    :description,
    :remote_url,
    :featured,
    :archived,
    :topic,
    :tags,
    :metric_discussions,
    :metric_datasets,
    :metric_followers,
    :metric_views,
    :created_at,
    :last_modified
  ]

  # Payload key => schema field, for the counters exposed as `metric.*`.
  # An explicit mapping means an unknown `metric.*` column is ignored instead
  # of crashing the import.
  @metric_fields %{
    "metric.discussions" => :metric_discussions,
    "metric.datasets" => :metric_datasets,
    "metric.followers" => :metric_followers,
    "metric.views" => :metric_views
  }

  typed_schema "reuse" do
    field(:datagouv_id, :string)
    field(:title, :string)
    field(:slug, :string)
    field(:url, :string)
    field(:type, :string)
    field(:description, :string)
    field(:remote_url, :string)
    field(:organization, :string)
    field(:organization_id, :string)
    field(:owner, :string)
    field(:owner_id, :string)
    field(:image, :string)
    field(:featured, :boolean)
    field(:archived, :boolean)
    field(:topic, :string)
    field(:tags, {:array, :string})
    field(:metric_discussions, :integer)
    field(:metric_datasets, :integer)
    field(:metric_followers, :integer)
    field(:metric_views, :integer)
    field(:created_at, :utc_datetime_usec)
    field(:last_modified, :utc_datetime_usec)

    many_to_many(:datasets, DB.Dataset, join_through: "reuse_dataset", on_replace: :delete)
  end

  @doc """
  Builds a changeset for a reuse from a string-keyed data.gouv.fr payload.

  On top of the attributes that are cast directly:

    * `"id"` is stored as `:datagouv_id`
    * `"metric.discussions"`, `"metric.datasets"`, `"metric.followers"` and
      `"metric.views"` are stored as integers in the matching `:metric_*` field
    * `"archived"` and `"featured"` are converted to booleans
    * `"tags"` (comma-separated) is stored as a list of strings
    * `"datasets"` (comma-separated data.gouv.fr dataset identifiers) replaces
      the `:datasets` association with the datasets known locally

  A payload missing one of these keys produces an invalid changeset (through
  `validate_required/2`) rather than raising.
  """
  @spec changeset(Ecto.Schema.t() | Ecto.Changeset.t(), map()) :: Ecto.Changeset.t()
  def changeset(model, attrs) do
    model
    |> cast(attrs, @cast_fields)
    |> transform_datagouv_id(attrs)
    |> transform_metric_keys(attrs)
    |> transform_archived(attrs)
    |> transform_featured(attrs)
    |> transform_tags(attrs)
    |> validate_required(@required_fields)
    |> cast_datasets(attrs)
  end

  # Replaces the `:datasets` association with the local datasets whose
  # `datagouv_id` is listed in the payload. Identifiers unknown locally are
  # ignored. Without a `"datasets"` key the association is left untouched.
  defp cast_datasets(%Ecto.Changeset{} = changeset, %{"datasets" => datasets}) do
    datagouv_ids = split_list(datasets)

    datasets =
      DB.Dataset.base_query()
      |> where([dataset: d], d.datagouv_id in ^datagouv_ids)
      |> select([dataset: d], [:id])
      |> DB.Repo.all()

    put_assoc(changeset, :datasets, datasets)
  end

  defp cast_datasets(%Ecto.Changeset{} = changeset, _attrs), do: changeset

  defp transform_archived(%Ecto.Changeset{} = changeset, params) do
    transform_bool(changeset, :archived, params)
  end

  defp transform_featured(%Ecto.Changeset{} = changeset, params) do
    transform_bool(changeset, :featured, params)
  end

  @doc """
  Sets the field `key` from the value stored under `to_string(key)` in `params`.

  A string value is `true` only when it equals `"true"` ignoring case; any
  other string is `false`. A non-string value is stored as-is. When the key is
  absent from `params` the changeset is returned unchanged, so an existing value
  is not overwritten with `nil`.
  """
  @spec transform_bool(Ecto.Changeset.t(), atom(), map()) :: Ecto.Changeset.t()
  def transform_bool(%Ecto.Changeset{} = changeset, key, params) do
    case Map.fetch(params, to_string(key)) do
      {:ok, value} when is_binary(value) ->
        put_change(changeset, key, String.downcase(value) == "true")

      {:ok, value} ->
        put_change(changeset, key, value)

      :error ->
        changeset
    end
  end

  # `"foo,bar"` => `["foo", "bar"]`; an empty or `nil` value gives `[]`.
  defp transform_tags(%Ecto.Changeset{} = changeset, %{"tags" => tags}) do
    put_change(changeset, :tags, split_list(tags))
  end

  defp transform_tags(%Ecto.Changeset{} = changeset, _params), do: changeset

  # The payload exposes the data.gouv.fr identifier as `"id"`; it is stored in
  # `:datagouv_id`. Any `:id` change is dropped so that the remote identifier
  # never ends up in the local primary key.
  defp transform_datagouv_id(%Ecto.Changeset{} = changeset, %{"id" => id}) do
    changeset |> put_change(:datagouv_id, id) |> delete_change(:id)
  end

  defp transform_datagouv_id(%Ecto.Changeset{} = changeset, _attrs), do: changeset

  # Copies the known `metric.*` counters into their `:metric_*` field.
  # A missing or non-integer value leaves the field unset, which is then
  # reported by `validate_required/2`.
  defp transform_metric_keys(%Ecto.Changeset{} = changeset, attributes) do
    Enum.reduce(@metric_fields, changeset, fn {source_key, field}, acc ->
      case attributes |> Map.get(source_key) |> parse_integer() do
        nil -> acc
        value -> put_change(acc, field, value)
      end
    end)
  end

  defp parse_integer(value) when is_integer(value), do: value

  defp parse_integer(value) when is_binary(value) do
    case value |> String.trim() |> Integer.parse() do
      {integer, ""} -> integer
      _ -> nil
    end
  end

  defp parse_integer(_value), do: nil

  # Splits a comma-separated string, dropping empty items so that `""` gives
  # `[]` and not `[""]`.
  defp split_list(nil), do: []
  defp split_list(value) when is_binary(value), do: String.split(value, ",", trim: true)
end
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
defmodule Transport.Jobs.ImportReusesJob do
  @moduledoc """
  Import reuses from data.gouv.fr when it uses at least a dataset referenced
  on our platform.

  The reuses CSV export is downloaded, then all existing `DB.Reuse` rows are
  deleted and replaced by the rows of the export, within a single database
  transaction.

  See https://www.data.gouv.fr/fr/datasets/catalogue-des-donnees-de-data-gouv-fr/
  """
  use Oban.Worker, max_attempts: 3
  import Ecto.Query

  # reuses.csv export
  @csv_url "https://www.data.gouv.fr/fr/datasets/r/970aafa0-3778-4d8b-b9d1-de937525e379"

  @impl Oban.Worker
  def perform(%Oban.Job{}) do
    # Download before opening the transaction so that the transaction is not
    # kept open while waiting on the network.
    csv_body = download_csv()

    # Assert on the result: a rolled back transaction must not be reported as
    # a successful job.
    {:ok, _} =
      DB.Repo.transaction(fn ->
        truncate_reuses()
        import_all_reuses(csv_body)
      end)

    :ok
  end

  # Any status other than 200 raises a `MatchError` and fails the job.
  defp download_csv do
    %{status: 200, body: body} = http_client().get!(@csv_url, decode_body: false)
    body
  end

  defp truncate_reuses, do: DB.Repo.delete_all(DB.Reuse)

  # Inserts a `DB.Reuse` for each row of the export that is neither empty nor
  # orphan. `insert!/1` raises on an invalid row, which aborts the whole import.
  defp import_all_reuses(csv_body) do
    datagouv_ids = dataset_datagouv_ids()

    [csv_body]
    |> CSV.decode!(headers: true, separator: ?;, escape_max_lines: 1_000)
    |> Stream.reject(fn %{"datasets" => datasets} = attributes ->
      empty_optional_fields?(attributes) or orphan_reuse?(datasets, datagouv_ids)
    end)
    |> Enum.each(fn record ->
      %DB.Reuse{} |> DB.Reuse.changeset(record) |> DB.Repo.insert!()
    end)
  end

  # A reuse is orphan when none of its (comma-separated) datasets is known
  # locally. `trim: true` makes an empty column yield no identifier at all
  # rather than the empty string.
  defp orphan_reuse?(datasets, datagouv_ids) do
    datasets
    |> String.split(",", trim: true)
    |> MapSet.new()
    |> MapSet.disjoint?(datagouv_ids)
  end

  # Set of the `datagouv_id` of every dataset returned by `DB.Dataset.base_query/0`.
  defp dataset_datagouv_ids do
    DB.Dataset.base_query()
    |> select([dataset: d], d.datagouv_id)
    |> DB.Repo.all()
    |> MapSet.new()
  end

  # True when `remote_url`, `description` and `datasets` are ALL empty.
  # NOTE(review): `DB.Reuse.changeset/2` requires `remote_url` and
  # `description`, so a row where only one of them is empty is presumably not
  # filtered here and would make `insert!/1` raise. Confirm whether `all?` is
  # intended.
  defp empty_optional_fields?(attributes) do
    attributes
    |> Map.take(["remote_url", "description", "datasets"])
    |> Map.values()
    |> Enum.all?(&(&1 == ""))
  end

  defp http_client, do: Transport.Req.impl()
end
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
defmodule DB.Repo.Migrations.CreateReuse do
2+
# Creates the `reuse` table backing the `DB.Reuse` schema: one column per
# schema field, with the same names.
use Ecto.Migration
3+
4+
def change do
5+
create table(:reuse) do
6+
# Identifier of the reuse on data.gouv.fr (the `"id"` key of the payload).
# NOTE(review): no unique index is declared on this column. Confirm whether
# one is wanted.
add(:datagouv_id, :string)
7+
add(:title, :string)
8+
add(:slug, :string)
9+
# Explicit sizes: presumably URLs can be longer than the default `:string`
# column. Verify against real data.
add(:url, :string, size: 500)
10+
add(:type, :string)
11+
add(:description, :text)
12+
add(:remote_url, :string, size: 1_000)
13+
add(:organization, :string)
14+
add(:organization_id, :string)
15+
add(:owner, :string)
16+
add(:owner_id, :string)
17+
add(:image, :string)
18+
add(:featured, :boolean, default: false)
19+
add(:archived, :boolean, default: false)
20+
add(:topic, :string)
21+
add(:tags, {:array, :string})
22+
# Counters read from the `metric.*` keys of the payload.
add(:metric_discussions, :integer)
23+
add(:metric_datasets, :integer)
24+
add(:metric_followers, :integer)
25+
add(:metric_views, :integer)
26+
# Dates are stored in explicit columns; `timestamps()` is not used.
add(:created_at, :utc_datetime_usec)
27+
add(:last_modified, :utc_datetime_usec)
28+
end
29+
end
30+
end
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
defmodule DB.Repo.Migrations.CreateReuseDataset do
2+
# Creates the `reuse_dataset` join table used by the `many_to_many(:datasets, ...)`
# association of `DB.Reuse` (`join_through: "reuse_dataset"`).
use Ecto.Migration
3+
4+
def change do
5+
# Pure join table: no primary key column of its own.
# NOTE(review): no index or unique constraint is declared on
# (reuse_id, dataset_id). Confirm whether one should be added.
create table(:reuse_dataset, primary_key: false) do
6+
# `on_delete: :delete_all` removes the join rows when the referenced reuse
# or dataset is deleted.
add(:reuse_id, references(:reuse, on_delete: :delete_all))
7+
add(:dataset_id, references(:dataset, on_delete: :delete_all))
8+
end
9+
end
10+
end

apps/transport/test/db/reuse_test.exs

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
defmodule DB.ReuseTest do
  use ExUnit.Case, async: true
  import DB.Factory

  setup do
    Ecto.Adapters.SQL.Sandbox.checkout(DB.Repo)
  end

  # Builds a changeset from a real payload, checks the converted values, then
  # persists it and checks that only the locally known dataset gets linked.
  test "changeset" do
    datagouv_id = "53699569a3a729239d2046eb"
    %DB.Dataset{id: dataset_id} = insert(:dataset, datagouv_id: datagouv_id)

    # Payload from https://tabular-api.data.gouv.fr/api/resources/970aafa0-3778-4d8b-b9d1-de937525e379/data/?page=1&page_size=50&topic__exact=transport_and_mobility
    data =
      ~s|{"id": "67c02dfe7172569a69c367e6","title": "Carte nationale des plateaux techniques spécialisés (PTS) pour « évaluer l’aptitude médicale à la conduite » ","slug": "carte-nationale-des-plateaux-techniques-specialises-pts-pour-evaluer-laptitude-medicale-a-la-conduite","url": "http://www.data.gouv.fr/fr/reuses/carte-nationale-des-plateaux-techniques-specialises-pts-pour-evaluer-laptitude-medicale-a-la-conduite/","type": "visualization","description": "Ceci est une description","remote_url": "https://www.securite-routiere.gouv.fr/permis-et-situation-de-handicap/carte-des-plateaux-techniques-de-sante","organization": null,"organization_id": null,"owner": "ilyes-zeroual","owner_id": "67c0216ab1f98413870cc70c","image": "https://static.data.gouv.fr/images/69/f0053e284741c9b6d2a73cc490edb2-500.png","featured": "False","created_at": "2025-02-27T09:18:54.658000","last_modified": "2025-02-27T09:49:33.676000","archived": "False","topic": "transport_and_mobility","tags": "foo,bar","datasets": "54730e00c751df4f2ec2acbe,#{datagouv_id}","metric.discussions": "0","metric.datasets": "2","metric.followers": "0","metric.views": "234"}|

    attributes = Jason.decode!(data)
    changeset = DB.Reuse.changeset(%DB.Reuse{}, attributes)

    assert %Ecto.Changeset{
             valid?: true,
             changes: %{
               datagouv_id: "67c02dfe7172569a69c367e6",
               tags: ["foo", "bar"],
               metric_views: 234,
               archived: false,
               featured: false,
               created_at: ~U[2025-02-27 09:18:54.658000Z]
             }
           } = changeset

    DB.Repo.insert!(changeset)

    # Of the two datasets listed in the payload, only the one inserted above
    # exists locally.
    [reuse] = DB.Repo.all(DB.Reuse)
    %DB.Reuse{datasets: linked_datasets} = DB.Repo.preload(reuse, :datasets)
    assert [%DB.Dataset{id: ^dataset_id}] = linked_datasets
  end
end
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
defmodule Transport.Test.Transport.Jobs.ImportReusesJobTest do
  use ExUnit.Case, async: true
  import DB.Factory
  import Mox
  use Oban.Testing, repo: DB.Repo

  setup :verify_on_exit!

  setup do
    Ecto.Adapters.SQL.Sandbox.checkout(DB.Repo)
  end

  @csv_url "https://www.data.gouv.fr/fr/datasets/r/970aafa0-3778-4d8b-b9d1-de937525e379"
  @dataset_datagouv_id Ecto.UUID.generate()

  test "perform" do
    %DB.Dataset{id: dataset_id} = insert(:dataset, datagouv_id: @dataset_datagouv_id)

    # An existing reuse will be deleted when importing all reuses
    %DB.Reuse{}
    |> DB.Reuse.changeset(sample_reuse(Ecto.UUID.generate()))
    |> DB.Repo.insert!()

    assert [%DB.Reuse{}] = DB.Repo.all(DB.Reuse)

    datagouv_id_1 = Ecto.UUID.generate()
    datagouv_id_2 = Ecto.UUID.generate()

    setup_csv_response([datagouv_id_1, datagouv_id_2])

    assert :ok == perform_job(Transport.Jobs.ImportReusesJob, %{})

    # We now have 2 reuses and they are associated with a dataset.
    # The previously existing reuse has been deleted and the orphan reuse of
    # the CSV (not referencing an existing dataset) has not been imported.
    reuses = DB.Reuse |> DB.Repo.all() |> DB.Repo.preload(:datasets)

    assert MapSet.new([datagouv_id_1, datagouv_id_2]) == MapSet.new(reuses, & &1.datagouv_id)

    # Check the association on every imported reuse, not only on the first one.
    for reuse <- reuses do
      assert %DB.Reuse{datasets: [%DB.Dataset{id: ^dataset_id}]} = reuse
    end
  end

  # Payload of a reuse, as a string-keyed map, with the given data.gouv.fr
  # identifier and linked to the given dataset identifiers.
  defp sample_reuse(datagouv_id, datasets \\ [@dataset_datagouv_id]) do
    ~s|{"title": "Carte nationale des plateaux techniques spécialisés (PTS) pour « évaluer l’aptitude médicale à la conduite » ","slug": "carte-nationale-des-plateaux-techniques-specialises-pts-pour-evaluer-laptitude-medicale-a-la-conduite","url": "http://www.data.gouv.fr/fr/reuses/carte-nationale-des-plateaux-techniques-specialises-pts-pour-evaluer-laptitude-medicale-a-la-conduite/","type": "visualization","description": "Ceci est une description","remote_url": "https://www.securite-routiere.gouv.fr/permis-et-situation-de-handicap/carte-des-plateaux-techniques-de-sante","organization": null,"organization_id": null,"owner": "ilyes-zeroual","owner_id": "67c0216ab1f98413870cc70c","image": "https://static.data.gouv.fr/images/69/f0053e284741c9b6d2a73cc490edb2-500.png","featured": "False","created_at": "2025-02-27T09:18:54.658000","last_modified": "2025-02-27T09:49:33.676000","archived": "False","topic": "transport_and_mobility","tags": "foo,bar","metric.discussions": "0","metric.datasets": "2","metric.followers": "0","metric.views": "234"}|
    |> Jason.decode!()
    |> Map.put("id", datagouv_id)
    |> Map.put("datasets", Enum.join(datasets, ","))
  end

  # Expects a single download of the CSV export. The body holds one reuse per
  # given identifier (linked to the known dataset) plus one orphan reuse linked
  # to an unknown dataset.
  defp setup_csv_response(datagouv_ids) do
    url = @csv_url
    orphan_reuse = sample_reuse(Ecto.UUID.generate(), [Ecto.UUID.generate()])

    body =
      (Enum.map(datagouv_ids, &sample_reuse/1) ++ [orphan_reuse])
      |> CSV.encode(headers: true, separator: ?;)
      |> Enum.join("")

    expect(Transport.Req.Mock, :get!, fn ^url, [decode_body: false] ->
      %Req.Response{status: 200, body: body}
    end)
  end
end

config/runtime.exs

+1
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ oban_prod_crontab = [
161161
{"20 5 * * *", Transport.Jobs.ImportDatasetContactPointsJob},
162162
# Should be ideally executed after `GBFSMultiValidationDispatcherJob` to use fresh metadata
163163
{"30 8 * * *", Transport.Jobs.ImportGBFSFeedContactEmailJob},
164+
{"20 5 * * *", Transport.Jobs.ImportReusesJob},
164165
{"30 5 * * *", Transport.Jobs.ImportDatasetMonthlyMetricsJob},
165166
{"45 5 * * *", Transport.Jobs.ImportResourceMonthlyMetricsJob},
166167
{"0 8 * * *", Transport.Jobs.WarnUserInactivityJob},

0 commit comments

Comments
 (0)