Skip to content

Commit

Permalink
Parse regex to get list of courts
Browse files Browse the repository at this point in the history
  • Loading branch information
dragon-dxw committed Dec 20, 2023
1 parent 9b03d1d commit df1fab1
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 52 deletions.
2 changes: 1 addition & 1 deletion src/ds_caselaw_utils/data/court_names.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
- code: UKPC
name: Privy Council
link: https://www.jcpc.uk/
ncn: \[(\d{4})\] (UKPC) \d+
ncn: \[(\d{4})\] (UKPC) (\d+)
param: "ukpc"
start_year: 2014
end_year: 2022
Expand Down
107 changes: 56 additions & 51 deletions src/ds_caselaw_utils/generate_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,72 +2,77 @@

from courts import courts


class ParsedURLPattern:
    """Thin wrapper that keeps hold of a pattern string."""

    def __init__(self, regex: str) -> None:
        """Store *regex* unchanged on ``self.regex``."""
        self.regex = regex
# Token every pattern must start with: a bracketed, captured four-digit year.
year_regex = r"\[(\d{4})\]"
# Token that captures the judgment number (a run of digits).
num_regex = r"(\d+)"


class URLPatternParserError(RuntimeError):
    """Raised when a pattern string does not have a layout the parser accepts."""


def url_order(pattern_string: str) -> list[str]:
    """Return the tokens of *pattern_string* rearranged into URL order.

    The string is split into space-separated tokens, then the 1-indexed
    positions produced by ``url_order_numbers`` pick the tokens in order.

    Raises:
        URLPatternParserError: propagated from ``url_order_numbers`` when the
            pattern layout is not recognised.
    """
    pattern = split_ncn(pattern_string)
    # Positions are 1-indexed, hence the ``- 1`` when indexing the token list.
    return [pattern[item - 1] for item in url_order_numbers(pattern_string)]


def split_ncn(pattern_string: str) -> list[str]:
    """Split *pattern_string* into tokens on single space characters.

    Consecutive spaces yield empty-string tokens, and an empty input yields
    ``[""]``, exactly as ``str.split(" ")`` does.
    """
    return pattern_string.split(" ")


def url_order_numbers(pattern_string: str) -> list[int]:
    """Return the 1-indexed token positions of *pattern_string* in URL order.

    The first token must be the year and one token must be the judgment
    number. The result is ``[2, subcourt, 1, number]`` when a sub court token
    is present and ``[2, 1, 3]`` otherwise.

    Raises:
        URLPatternParserError: if the pattern does not start with the year
            token, contains no judgment-number token, or has the judgment
            number anywhere other than position 3 or 4.
    """
    pattern = split_ncn(pattern_string)
    if pattern[0] != year_regex:
        raise URLPatternParserError(
            f"Pattern {pattern_string} does not start with year"
        )

    if num_regex not in pattern:
        raise URLPatternParserError(
            f"Pattern {pattern_string} does not contain judgment number"
        )

    # The sub court sits in whichever late position (3 or 4) the judgment
    # number does not occupy.
    # Note: these positions are 1-indexed.
    num_position = pattern.index(num_regex) + 1
    if num_position == 4:
        subcourt_position = 3
    elif num_position == 3:
        # Only a four-token pattern has room for a trailing sub court.
        subcourt_position = 4 if len(pattern) == 4 else None
    else:
        raise URLPatternParserError(
            f"Did not expect judgment number at position {num_position}"
        )

    if subcourt_position is not None:
        return [2, subcourt_position, 1, num_position]
    return [2, 1, 3]
class ParsedURLPattern:
    """A space-separated pattern string split into tokens for inspection.

    Accepted layouts start with the year token and hold the judgment-number
    token at position 3 or 4 (1-indexed); a four-token pattern carries a sub
    court in whichever of those two positions the number does not occupy.
    """

    def __init__(self, regex: str) -> None:
        """Keep the raw string on ``regex`` and its tokens on ``pattern``."""
        self.regex = regex
        self.pattern = regex.split(" ")

    @property
    def court(self) -> str:
        """First token in URL order with surrounding parentheses stripped.

        Raises:
            URLPatternParserError: if the pattern layout is not recognised.
        """
        return self.url_order[0].strip("()")

    @property
    def subcourt(self) -> "str | None":
        """Second token in URL order without parentheses or backslashes.

        Returns ``None`` for any pattern that does not have exactly four
        tokens.

        Raises:
            URLPatternParserError: if a four-token pattern has a layout that
                is not recognised.
        """
        if len(self.pattern) != 4:
            return None
        return self.url_order[1].strip(r"()\\")

    @property
    def url_order(self) -> list[str]:
        """Tokens of the pattern rearranged into URL order."""
        # Positions are 1-indexed, hence the ``- 1``.
        return [self.pattern[item - 1] for item in self.url_order_numbers]

    @property
    def url_order_numbers(self) -> list[int]:
        """1-indexed token positions in URL order.

        The result is ``[2, subcourt, 1, number]`` when a sub court token is
        present and ``[2, 1, 3]`` otherwise.

        Raises:
            URLPatternParserError: if the pattern does not start with the
                year token, contains no judgment-number token, or has the
                judgment number anywhere other than position 3 or 4.
        """
        if self.pattern[0] != year_regex:
            raise URLPatternParserError(
                f"Pattern {self.regex} does not start with year"
            )

        if num_regex not in self.pattern:
            raise URLPatternParserError(
                f"Pattern {self.regex} does not contain judgment number"
            )

        # The sub court sits in whichever late position (3 or 4) the
        # judgment number does not occupy.
        # Note: these positions are 1-indexed.
        num_position = self.pattern.index(num_regex) + 1
        if num_position == 4:
            subcourt_position = 3
        elif num_position == 3:
            # Only a four-token pattern has room for a trailing sub court.
            subcourt_position = 4 if len(self.pattern) == 4 else None
        else:
            raise URLPatternParserError(
                f"Did not expect judgment number at position {num_position}"
            )

        if subcourt_position is not None:
            return [2, subcourt_position, 1, num_position]
        return [2, 1, 3]

# Token every pattern must start with: a bracketed, captured four-digit year.
year_regex = r"\[(\d{4})\]"
# Token that captures the judgment number (a run of digits).
num_regex = r"(\d+)"

courtlist = courts.get_all()
# De-duplicate the patterns of every court that defines one; sort them so the
# printed report is in a stable order.
url_patterns = sorted({court.ncn for court in courtlist if court.ncn})

court_strings = set()
subcourt_strings = set()
for url_pattern in url_patterns:
    pattern = ParsedURLPattern(url_pattern)
    print(pattern.regex, pattern.url_order_numbers, pattern.url_order)
    # Collect the parsed court code, not a slice of the raw regex string.
    court_strings.add(pattern.court)
    if pattern.subcourt:
        subcourt_strings.add(pattern.subcourt)

print(court_strings, subcourt_strings)

0 comments on commit df1fab1

Please sign in to comment.