Parse regex to get list of courts

nationalarchives · Dec 20, 2023 · df1fab1 · df1fab1
1 parent 9b03d1d
commit df1fab1
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 52 deletions.
diff --git a/src/ds_caselaw_utils/data/court_names.yaml b/src/ds_caselaw_utils/data/court_names.yaml
@@ -18,7 +18,7 @@
     - code: UKPC
       name: Privy Council
       link: https://www.jcpc.uk/
-      ncn: \[(\d{4})\] (UKPC) \d+
+      ncn: \[(\d{4})\] (UKPC) (\d+)
       param: "ukpc"
       start_year: 2014
       end_year: 2022

diff --git a/src/ds_caselaw_utils/generate_regex.py b/src/ds_caselaw_utils/generate_regex.py
@@ -2,72 +2,77 @@
 
 from courts import courts
 
-
-class ParsedURLPattern:
-    def __init__(self, regex):
-        self.regex = regex
+year_regex = r"\[(\d{4})\]"
+num_regex = r"(\d+)"
 
 
 class URLPatternParserError(RuntimeError):
     pass
 
 
-def url_order(pattern_string: str) -> list[str]:
-    builder = []
-    pattern = split_ncn(pattern_string)
-    for item in url_order_numbers(pattern_string):
-        builder.append(pattern[item - 1])
-    return builder
-
-
-def split_ncn(pattern_string: str) -> list[str]:
-    return pattern_string.split(" ")
-
-
-def url_order_numbers(pattern_string: str) -> list[int]:
-    pattern = split_ncn(pattern_string)
-    if pattern[0] != year_regex:
-        raise URLPatternParserError(
-            f"Pattern {pattern_string} does not start with year"
-        )
-
-    if num_regex not in pattern:
-        raise URLPatternParserError(
-            f"Pattern {pattern_string} does not contain judgment number"
-        )
-
-    "the sub court is in the late position the judgment number is not."
-    "Note: these are 1-indexed"
-
-    num_position = pattern.index(num_regex) + 1
-    if num_position == 4:
-        subcourt_position = 3
-    elif num_position == 3:
-        if len(pattern) == 4:
-            subcourt_position = 4
+class ParsedURLPattern:
+    def __init__(self, regex: str):
+        self.regex = regex
+        self.pattern = regex.split(" ")
+
+    @property
+    def court(self):
+        return self.url_order[0].strip("()")
+
+    @property
+    def subcourt(self):
+        if len(self.pattern) != 4:
+            return None
+        return self.url_order[1].strip(r"()\\")
+
+    @property
+    def url_order(self) -> list[str]:
+        return [self.pattern[item - 1] for item in self.url_order_numbers]
+
+    @property
+    def url_order_numbers(self) -> list[int]:
+        if self.pattern[0] != year_regex:
+            raise URLPatternParserError(
+                f"Pattern {self.regex} does not start with year"
+            )
+
+        if num_regex not in self.pattern:
+            raise URLPatternParserError(
+                f"Pattern {self.regex} does not contain judgment number"
+            )
+
+        "The sub court is in whichever late position (3 or 4) the judgment number is not."
+        "Note: these are 1-indexed"
+
+        num_position = self.pattern.index(num_regex) + 1
+        if num_position == 4:
+            subcourt_position = 3
+        elif num_position == 3:
+            if len(self.pattern) == 4:
+                subcourt_position = 4
+            else:
+                subcourt_position = None
         else:
-            subcourt_position = None
-    else:
-        raise URLPatternParserError(
-            f"Did not expect judgment number at position {num_position}"
-        )
-
-    if subcourt_position:
-        return [2, subcourt_position, 1, num_position]
-    else:
-        return [2, 1, 3]
+            raise URLPatternParserError(
+                f"Did not expect judgment number at position {num_position}"
+            )
 
+        if subcourt_position:
+            return [2, subcourt_position, 1, num_position]
+        else:
+            return [2, 1, 3]
 
-year_regex = r"\[(\d{4})\]"
-num_regex = r"(\d+)"
 
 courtlist = courts.get_all()
 url_patterns = sorted(list(set(court.ncn for court in courtlist if court.ncn)))
 
 court_strings = set()
 subcourt_strings = set()
 for url_pattern in url_patterns:
-    print(url_pattern, url_order_numbers(url_pattern), url_order(url_pattern))
-    court_strings.add(url_pattern[0])
+    pattern = ParsedURLPattern(url_pattern)
+    print(pattern.regex, pattern.url_order_numbers, pattern.url_order)
+    court_strings.add(pattern.court)
+    if pattern.subcourt:
+        subcourt_strings.add(pattern.subcourt)
 
 print(court_strings, subcourt_strings)