Skip to content

Commit

Permalink
updated docs
Browse files Browse the repository at this point in the history
  • Loading branch information
sivakumar-mahalingam committed Jan 8, 2025
1 parent 2726cd2 commit 21cadc9
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 32 deletions.
32 changes: 28 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Fast MRZ

<div align="center">

[![License](https://img.shields.io/badge/license-AGPL%203.0-34D058?color=blue)](https://github.com/sivakumar-mahalingam/fastmrz/blob/main/LICENSE)
[![Downloads](https://static.pepy.tech/badge/fastmrz)](https://pypistats.org/packages/fastmrz)
![Python](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue?logo=python&logoColor=959DA5)
Expand All @@ -10,7 +12,18 @@
<img src="https://raw.githubusercontent.com/sivakumar-mahalingam/fastmrz/main/docs/FastMRZ.png" target="_blank" />
</a>

This repository extracts the Machine Readable Zone (MRZ) from document images. The MRZ typically contains important information such as the document holder's name, nationality, document number, date of birth, etc.
FastMRZ is an open-source Python package that extracts the Machine Readable Zone (MRZ) from passports and other documents. It accepts several input formats: an image file path, a Base64-encoded image string, raw MRZ text, or a NumPy array.

[Features](#features)
[Built With](#built-with)
[Prerequisites](#prerequisites)
[Installation](#installation)
[Example](#example)
[Wiki](#wiki)
[ToDo](#todo)
[Contributing](#contributing)

</div>

## ️✨Features:

Expand Down Expand Up @@ -93,7 +106,7 @@ P<GBRPUDARSAN<<HENERT<<<<<<<<<<<<<<<<<<<<<<<
7077979792GBR9505209M1704224<<<<<<<<<<<<<<00
```
## 📃MRZ Wiki
## 📃Wiki
<details>
<summary><b>MRZ Types & Format</b></summary>
Expand Down Expand Up @@ -125,18 +138,29 @@ Now, based on the example of a national passport, let us take a closer look at t
- [x] Function to return mrz text as output
- [ ] Bulk process
- [ ] Train Tesseract model with additional data
- [ ] Add function parameter - include_checkdigit
- [x] Add function parameter - include_checkdigit
- [ ] Add function - get_mrz_image
- [x] Add function - validate_mrz
- [ ] Add function - generate_mrz
- [ ] Extract face image
- [ ] Add documentation page
## 🤝 Contributing
Contributions are welcome! Here's how you can help:
1. Fork the repository
2. Create a new branch (`git checkout -b feature/amazing-feature`)
3. Make your changes
4. Commit your changes (`git commit -m 'feat: add amazing feature'`)
5. Push to the branch (`git push origin feature/amazing-feature`)
6. Open a Pull Request
## ⚖️License
Distributed under the AGPL-3.0 License. See `LICENSE` for more information.
## Show your support
## 🙏Show your support
Give a ⭐️ if <a href="https://github.com/sivakumar-mahalingam/fastmrz/">this</a> project helped you!
86 changes: 59 additions & 27 deletions fastmrz/fastmrz.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,16 +75,16 @@ def _cleanse_roi(self, mrz_text):

return "\n".join(new_list)

def _get_final_check_digit(self, input_string, input_type):
def _get_final_checkdigit(self, input_string, input_type):
if input_type == "TD3":
return self._get_check_digit(input_string[:10] + input_string[13:20] + input_string[21:43])
return self._get_checkdigit(input_string[:10] + input_string[13:20] + input_string[21:43])
elif input_type == "TD2":
return self._get_check_digit(input_string[:10] + input_string[13:20] + input_string[21:35])
return self._get_checkdigit(input_string[:10] + input_string[13:20] + input_string[21:35])
else:
return self._get_check_digit(input_string[0][5:] + input_string[1][:7] + input_string[1][8:15]
return self._get_checkdigit(input_string[0][5:] + input_string[1][:7] + input_string[1][8:15]
+ input_string[1][18:29])

def _get_check_digit(self, input_string):
def _get_checkdigit(self, input_string):
weights_pattern = [7, 3, 1]

total = 0
Expand Down Expand Up @@ -130,15 +130,23 @@ def _get_mrz(self, image):

return self._cleanse_roi(mrz_roi)

def _base64_to_image_array(self, base64_string):
def _image_to_base64(self, imagepath):
image_file = open(imagepath, "rb")
image_data = image_file.read()
image_file.close()
base64_string = base64.b64encode(image_data).decode("utf-8")

return base64_string

def _base64_to_array(self, base64_string):
    """Decode a Base64-encoded image and return it as a NumPy array.

    The decoded bytes are wrapped in an in-memory stream, opened with
    ``Image.open`` and converted with ``np.array``.
    """
    raw_bytes = base64.b64decode(base64_string)
    return np.array(Image.open(BytesIO(raw_bytes)))

def _parse_mrz(self, mrz_text):
def _parse_mrz(self, mrz_text, include_checkdigit=True):
if not mrz_text:
return {"status": "FAILURE", "message": "No MRZ detected"}
mrz_lines = mrz_text.strip().split("\n")
Expand All @@ -165,39 +173,55 @@ def _parse_mrz(self, mrz_text):

# Line 2
mrz_code_dict["document_number"] = mrz_lines[1][:9].replace("<", "")
mrz_code_dict["document_number_checkdigit"] = self._get_check_digit(mrz_code_dict["document_number"])
if mrz_code_dict["document_number_checkdigit"] != mrz_lines[1][9]:
document_number_checkdigit = self._get_checkdigit(mrz_code_dict["document_number"])
if document_number_checkdigit != mrz_lines[1][9]:
return {"status": "FAILURE", "message": "Document number checksum is not matching"}
if include_checkdigit:
mrz_code_dict["document_number_checkdigit"] = document_number_checkdigit

mrz_code_dict["nationality_code"] = mrz_lines[1][10:13]
if not mrz_code_dict["nationality_code"].isalpha():
return {"status": "FAILURE", "message": "Invalid MRZ format"}

mrz_code_dict["birth_date"] = mrz_lines[1][13:19]
if self._get_check_digit(mrz_code_dict["birth_date"]) != mrz_lines[1][19]:
birth_date_checkdigit = self._get_checkdigit(mrz_code_dict["birth_date"])
if birth_date_checkdigit != mrz_lines[1][19]:
return {"status": "FAILURE", "message": "Date of birth checksum is not matching"}
if include_checkdigit:
mrz_code_dict["birth_date_checkdigit"] = birth_date_checkdigit
mrz_code_dict["birth_date"] = self._format_date(mrz_code_dict["birth_date"])

mrz_code_dict["sex"] = mrz_lines[1][20]

mrz_code_dict["expiry_date"] = mrz_lines[1][21:27]
if self._get_check_digit(mrz_code_dict["expiry_date"]) != mrz_lines[1][27]:
expiry_date_checkdigit = self._get_checkdigit(mrz_code_dict["expiry_date"])
if expiry_date_checkdigit != mrz_lines[1][27]:
return {"status": "FAILURE", "message": "Date of expiry checksum is not matching"}
if include_checkdigit:
mrz_code_dict["expiry_date_checkdigit"] = expiry_date_checkdigit
mrz_code_dict["expiry_date"] = self._format_date(mrz_code_dict["expiry_date"])
mrz_code_dict["birth_date"] = self._get_birth_date(mrz_code_dict["birth_date"], mrz_code_dict["expiry_date"])

if mrz_code_dict["mrz_type"] == "TD2":
mrz_code_dict["optional_data"] = mrz_lines[1][28:35].strip("<")
elif mrz_code_dict["mrz_type"] == "TD3":
mrz_code_dict["optional_data"] = mrz_lines[1][28:42].strip("<")
optional_data_checkdigit = self._get_checkdigit(mrz_code_dict["optional_data"].strip("<"))
if optional_data_checkdigit != mrz_lines[1][42]:
return {"status": "FAILURE", "message": "Optional data checksum is not matching"}
if include_checkdigit:
mrz_code_dict["optional_data_checkdigit"] = optional_data_checkdigit
elif mrz_code_dict["mrz_type"] == "MRVA":
mrz_code_dict["optional_data"] = mrz_lines[1][28:44].strip("<")
else:
mrz_code_dict["optional_data"] = mrz_lines[1][28:36].strip("<")

if (mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines[1], mrz_code_dict["mrz_type"])
final_checkdigit = self._get_final_checkdigit(mrz_lines, mrz_code_dict["mrz_type"])
if (mrz_lines[1][-1] != final_checkdigit
and mrz_code_dict["mrz_type"] not in ("MRVA", "MRVB")):
return {"status": "FAILURE", "message": "Final checksum is not matching"}
if include_checkdigit:
mrz_code_dict["final_checkdigit"] = final_checkdigit
else:
mrz_code_dict["mrz_type"] = "TD1"

Expand All @@ -209,23 +233,31 @@ def _parse_mrz(self, mrz_text):
return {"status": "FAILURE", "message": "Invalid MRZ format"}

mrz_code_dict["document_number"] = mrz_lines[0][5:14]
mrz_code_dict["document_number_checkdigit"] = self._get_check_digit(mrz_code_dict["document_number"])
if mrz_code_dict["document_number_checkdigit"] != mrz_lines[0][14]:
document_number_checkdigit = self._get_checkdigit(mrz_code_dict["document_number"])
if document_number_checkdigit != mrz_lines[0][14]:
return {"status": "FAILURE", "message": "Document number checksum is not matching"}
if include_checkdigit:
mrz_code_dict["document_number_checkdigit"] = document_number_checkdigit

mrz_code_dict["optional_data_1"] = mrz_lines[0][15:].strip("<")

# Line 2
mrz_code_dict["birth_date"] = mrz_lines[1][:6]
if self._get_check_digit(mrz_code_dict["birth_date"]) != mrz_lines[1][6]:
birth_date_checkdigit = self._get_checkdigit(mrz_code_dict["birth_date"])
if birth_date_checkdigit != mrz_lines[1][6]:
return {"status": "FAILURE", "message": "Date of birth checksum is not matching"}
if include_checkdigit:
mrz_code_dict["birth_date_checkdigit"] = birth_date_checkdigit
mrz_code_dict["birth_date"] = self._format_date(mrz_code_dict["birth_date"])

mrz_code_dict["sex"] = mrz_lines[1][7]

mrz_code_dict["expiry_date"] = mrz_lines[1][8:14]
if self._get_check_digit(mrz_code_dict["expiry_date"]) != mrz_lines[1][14]:
expiry_date_checkdigit = self._get_checkdigit(mrz_code_dict["expiry_date"])
if expiry_date_checkdigit != mrz_lines[1][14]:
return {"status": "FAILURE", "message": "Date of expiry checksum is not matching"}
if include_checkdigit:
mrz_code_dict["expiry_date_checkdigit"] = expiry_date_checkdigit
mrz_code_dict["expiry_date"] = self._format_date(mrz_code_dict["expiry_date"])

mrz_code_dict["birth_date"] = self._get_birth_date(mrz_code_dict["birth_date"], mrz_code_dict["expiry_date"])
Expand All @@ -235,8 +267,11 @@ def _parse_mrz(self, mrz_text):
return {"status": "FAILURE", "message": "Invalid MRZ format"}

mrz_code_dict["optional_data_2"] = mrz_lines[0][18:29].strip("<")
if mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines, mrz_code_dict["mrz_type"]):
final_checkdigit = self._get_final_checkdigit(mrz_lines, mrz_code_dict["mrz_type"])
if mrz_lines[1][-1] != final_checkdigit:
return {"status": "FAILURE", "message": "Final checksum is not matching"}
if include_checkdigit:
mrz_code_dict["final_checkdigit"] = final_checkdigit

# Line 3
names = mrz_lines[2].split("<<")
Expand All @@ -263,31 +298,28 @@ def get_details(self, input_data, input_type="imagepath", ignore_parse=False, in
if input_type == "imagepath":
if not self._is_valid(input_data):
raise ValueError("Input is not a valid image file.")
image_file = open(input_data, "rb")
image_data = image_file.read()
image_file.close()
base64_string = base64.b64encode(image_data).decode("utf-8")
image_array = self._base64_to_image_array(base64_string)
base64_string = self._image_to_base64(input_data)
image_array = self._base64_to_array(base64_string)
mrz_text = self._get_mrz(image_array)

return mrz_text if ignore_parse else self._parse_mrz(mrz_text)
return mrz_text if ignore_parse else self._parse_mrz(mrz_text, include_checkdigit=include_checkdigit)
elif input_type == "numpy":
if not self._is_valid(input_data):
raise ValueError("Input is not a valid NumPy array.")
mrz_text = self._get_mrz(input_data)

return mrz_text if ignore_parse else self._parse_mrz(mrz_text)
return mrz_text if ignore_parse else self._parse_mrz(mrz_text, include_checkdigit=include_checkdigit)
elif input_type == "base64":
image_array = self._base64_to_image_array(input_data)
image_array = self._base64_to_array(input_data)
mrz_text = self._get_mrz(image_array)

return mrz_text if ignore_parse else self._parse_mrz(mrz_text)
return mrz_text if ignore_parse else self._parse_mrz(mrz_text, include_checkdigit=include_checkdigit)
elif input_type == "pdf":
# get_details_from_pdf(input_data, ignore_parse=False, include_checkdigit=True)
pass
elif input_type == "text":
mrz_text = self._cleanse_roi(input_data)

return mrz_text if ignore_parse else self._parse_mrz(mrz_text)
return mrz_text if ignore_parse else self._parse_mrz(mrz_text, include_checkdigit=include_checkdigit)
else:
raise ValueError(f"Unsupported input_type: {input_type}")
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name="fastmrz",
version="1.3",
version="2.0",
author="Sivakumar Mahalingam",
description="Extracts the Machine Readable Zone (MRZ) data from document images",
long_description=long_description,
Expand Down

0 comments on commit 21cadc9

Please sign in to comment.