Skip to content

Commit

Permalink
Cambiado el lector de texto nativo por la librería pdfminer para la extracción de metadatos.
Browse files Browse the repository at this point in the history
  • Loading branch information
YM162 committed Aug 21, 2022
1 parent 95972a6 commit 776868b
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 16 deletions.
23 changes: 7 additions & 16 deletions PDFU/PDFU_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,11 @@
from pdfrw.objects import PdfName
from pikepdf import Pdf
import re
from pdfminer.high_level import extract_text

def meta(prepdf):
#print(prepdf.pages[0]["/Contents"][0].read_bytes())
page = prepdf.pages[0]
instructions = pikepdf.parse_content_stream(page)
data = pikepdf.unparse_content_stream(instructions)
data = data.decode('ascii')
pattern="<(.*?)>"

metalist = []

for substring in re.findall(pattern,data):
bytes_object = bytes.fromhex(substring)
text = bytes_object.decode("latin-1")
metalist.append(text)
def meta(pdf_path):
text = extract_text(pdf_path,maxpages=1)
metalist = list(filter(None,text.splitlines()))
metadict = {
"Archivo":metalist[0],
"Autor":metalist[1],
Expand All @@ -40,10 +30,11 @@ def deembed(pdf_path):
pdf_path: The path where the pdf file is located.
Returns:
return_msg: Dict. with three values:
return_msg: Dict. with four values:
Success: bool indicating whether the process was successful.
return_path: If successful, returns the path of the deembedded file.
Error: If unsuccessful, returns a description of the error.
Meta: Dictionary with information about the file.
'''
print("Trying to Deembed:",pdf_path)
return_msg={"Success":False,"return_path":"","Error":"","Meta":{}}
Expand All @@ -56,7 +47,7 @@ def deembed(pdf_path):
prepdf=Pdf.open(pdf_path)

try:
metadict = meta(prepdf)
metadict = meta(pdf_path)
return_msg["Meta"]=metadict
except:
print("Meta not extracted. Probably not a W file.")
Expand Down
Binary file added PDFU/__pycache__/PDFU_extract.cpython-39.pyc
Binary file not shown.
Binary file added dist/PDFU-0.2.1-py3-none-any.whl
Binary file not shown.
Binary file added dist/PDFU-0.2.1.tar.gz
Binary file not shown.

0 comments on commit 776868b

Please sign in to comment.