Skip to content

Commit

Permalink
v0.5.5: fix mupdf parser
Browse files (browse the repository at this point in the history)
  • Loading branch information
codereverser committed Aug 6, 2022
1 parent 404dc9a commit d1c2b9d
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 390 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## 0.5.5 - 2022-08-06
- bug fix in MuPDF parser

## 0.5.4 - 2022-02-01
- bug fix in CAS summary statement parser

Expand Down
10 changes: 5 additions & 5 deletions casparser/parsers/mupdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,10 @@ def parse_file_type(blocks):
return FileType.UNKNOWN


def parse_investor_info(page_dict) -> InvestorInfo:
def parse_investor_info(page_dict, page_rect: fitz.Rect) -> InvestorInfo:
"""Parse investor info."""
width = max(page_dict["width"], 600)
height = max(page_dict["height"], 800)
width = max(page_rect.width, 600)
height = max(page_rect.height, 800)

blocks = sorted(
[x for x in page_dict["blocks"] if x["bbox"][1] < height / 2], key=lambda x: x["bbox"][1]
Expand Down Expand Up @@ -190,7 +190,7 @@ def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData

with fp:
try:
doc = fitz.open(stream=fp.read(), filetype="pdf")
doc = fitz.Document(stream=fp.read(), filetype="pdf")
except Exception as e:
raise CASParseError("Unhandled error while opening file :: %s" % (str(e)))

Expand All @@ -210,7 +210,7 @@ def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData
file_type = parse_file_type(blocks)
sorted_blocks = sorted(blocks, key=itemgetter(1, 0))
if investor_info is None:
investor_info = parse_investor_info(page_dict)
investor_info = parse_investor_info(page_dict, page.rect)
pages.append(sorted_blocks)
lines = group_similar_rows(pages)
return PartialCASData(file_type=file_type, investor_info=investor_info, lines=lines)
1 change: 0 additions & 1 deletion casparser/process/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from decimal import Decimal
from typing import Optional, Tuple

from casparser_isin import MFISINDb
Expand Down
Loading

0 comments on commit d1c2b9d

Please sign in to comment.