Skip to content

Commit

Permalink
Merge pull request #66 from arthur-schnitzler/144-try-out-httpsrecord…
Browse files Browse the repository at this point in the history
…linkagereadthedocsio

added mgm script to identify potential duplicated person/places #144
  • Loading branch information
csae8092 authored Feb 7, 2024
2 parents 126effd + 3ca5d40 commit b855f88
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,5 @@ media/listbibl.xml
staticfiles/
hansi.csv
.docker
media/duplicated_*.csv
Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os
import pandas as pd
import recordlinkage

from typing import Any
from django.conf import settings
from django.core.management.base import BaseCommand

from apis_core.apis_entities.models import Person


class Command(BaseCommand):
    """Management command that reports Person records that look like duplicates.

    Persons with a start date are loaded into a DataFrame, paired up when they
    share the same ``name``, and each pair is scored on exact agreement of
    ``first_name``, ``start_date__year`` and ``end_date__year``. Pairs agreeing
    on all three are written to ``duplicated_persons.csv`` in ``MEDIA_ROOT``.
    """

    help = "lists potential duplicated entities"

    def handle(self, *args: Any, **options: Any) -> str | None:
        """Find candidate duplicate persons and dump them to a CSV file.

        Args:
            *args: Positional arguments passed by Django (unused).
            **options: Parsed command-line options (unused).

        Returns:
            None. The result is written to
            ``<MEDIA_ROOT>/duplicated_persons.csv`` and a summary is written
            to the command's stdout.
        """
        # Write through self.stdout (not print) so callers such as
        # ``call_command(..., stdout=...)`` and tests can capture the output.
        self.stdout.write("searching for potential duplicates")

        props = [
            "id",
            "name",
            "first_name",
            "start_date__year",
            "end_date__year",
        ]
        # Everything is cast to str so the exact comparisons below operate on
        # uniform values.
        # NOTE(review): missing values (e.g. an unknown end date) are turned
        # into the same placeholder string by this cast and therefore compare
        # as equal to each other -- confirm this is intended.
        df = pd.DataFrame(
            Person.objects.exclude(start_date__isnull=True).values_list(*props),
            columns=props,
        ).astype("str")

        # Human-readable row label "<id> <name> <first_name>" so the CSV can be
        # read without looking records up by primary key.
        df["custom_index"] = df["id"] + " " + df["name"] + " " + df["first_name"]
        df.set_index("custom_index", inplace=True)

        # Blocking on "name": only records sharing the same name are paired,
        # which avoids comparing every record with every other record.
        indexer = recordlinkage.Index()
        indexer.block(["name"])
        candidate_links = indexer.index(df)

        compare_cl = recordlinkage.Compare()
        compare_cl.exact("first_name", "first_name", label="first_name")
        compare_cl.exact(
            "start_date__year", "start_date__year", label="start_date__year"
        )
        compare_cl.exact("end_date__year", "end_date__year", label="end_date__year")
        features = compare_cl.compute(candidate_links, df)

        # Three comparisons are configured, so a row sum above 2 means the
        # pair agrees on all of them.
        matches = features[features.sum(axis=1) > 2]

        save_path = os.path.join(settings.MEDIA_ROOT, "duplicated_persons.csv")
        matches.to_csv(save_path)
        self.stdout.write(f"found {len(matches)} potential duplicates")
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import os
import pandas as pd
import recordlinkage

from typing import Any
from django.conf import settings
from django.core.management.base import BaseCommand

from apis_core.apis_entities.models import Place


class Command(BaseCommand):
    """Management command that reports Place records that look like duplicates.

    All places are loaded into a DataFrame and paired up when they share the
    same ``name``; the resulting pairs are written to
    ``duplicated_places.csv`` in ``MEDIA_ROOT``.
    """

    help = "lists potential duplicated entities"

    def handle(self, *args: Any, **options: Any) -> str | None:
        """Find candidate duplicate places and dump them to a CSV file.

        Args:
            *args: Positional arguments passed by Django (unused).
            **options: Parsed command-line options (unused).

        Returns:
            None. The result is written to
            ``<MEDIA_ROOT>/duplicated_places.csv`` and a summary is written
            to the command's stdout.
        """
        # Write through self.stdout (not print) so callers such as
        # ``call_command(..., stdout=...)`` and tests can capture the output.
        self.stdout.write("searching for potential duplicates")

        props = [
            "id",
            "name",
        ]
        df = pd.DataFrame(
            Place.objects.values_list(*props),
            columns=props,
        ).astype("str")

        # Human-readable row label "<id> <name>" so the CSV can be read
        # without looking records up by primary key.
        df["custom_index"] = df["id"] + " " + df["name"]
        df.set_index("custom_index", inplace=True)

        # Blocking on "name": only records sharing the same name are paired.
        indexer = recordlinkage.Index()
        indexer.block(["name"])
        candidate_links = indexer.index(df)

        # The exact comparison on "name" repeats the blocking criterion, so
        # every candidate pair is expected to pass the filter below; it is
        # kept so the CSV carries an explicit "name" feature column.
        compare_cl = recordlinkage.Compare()
        compare_cl.exact("name", "name", label="name")
        features = compare_cl.compute(candidate_links, df)
        matches = features[features.sum(axis=1) > 0]

        save_path = os.path.join(settings.MEDIA_ROOT, "duplicated_places.csv")
        matches.to_csv(save_path)
        self.stdout.write(f"found {len(matches)} potential duplicates")
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@ psycopg2
pyocclient==0.6
icecream
flake8
black
black
recordlinkage>0.15,<1

0 comments on commit b855f88

Please sign in to comment.