Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Find non-pdf tech notes #363

Merged
merged 3 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 26 additions & 16 deletions datman/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1334,11 +1334,7 @@ def make_zip(source_dir, dest_zip):


def find_tech_notes(folder):
    """Find any technotes located within a given folder.

    Every file below ``folder`` whose extension is pdf, png or jpg (matched
    case-insensitively) is treated as a candidate. Each candidate is scored
    on its file name only: +3 if the name contains 'tech', +2 if it contains
    'note' and +1 if it is a PDF. The highest scoring candidate is returned;
    on a tie the file encountered first by ``os.walk`` wins. Candidates that
    score zero (e.g. an unrelated image) are never returned.

    Args:
        folder (str): A full path to a folder to search.

    Returns:
        path (str): The full path to the tech notes or an empty string if
            none have been found.
    """
    exts = (".pdf", ".png", ".jpg")

    best_path = ""
    best_score = 0
    for root, _, files in os.walk(folder):
        for fname in files:
            name = fname.lower()
            # Compare the real extension so 'SCAN.PDF' is accepted and a
            # name that merely ends in the letters 'pdf' is not.
            ext = os.path.splitext(name)[1]
            if ext not in exts:
                continue

            # Score the file name only. Scoring the full path would let a
            # parent directory containing 'tech' or 'note' make every file
            # beneath it look like the tech notes.
            score = 0
            if "tech" in name:
                score += 3
            if "note" in name:
                score += 2
            if ext == ".pdf":
                score += 1

            # Strict comparison keeps the first file seen on ties and
            # guarantees zero-scoring files are never selected.
            if score > best_score:
                best_score = score
                best_path = os.path.join(root, fname)

    return best_path


def read_json(path):
Expand Down
44 changes: 40 additions & 4 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def test_catches_invalid_site_in_kcni_id(self, dm_config):

class FindTechNotes(unittest.TestCase):
notes = "TechNotes.pdf"
jpg_notes = "TechNotes.jpg"
other_pdf1 = "SomeFile.pdf"
other_pdf2 = "otherFile.pdf"
path = "./resources"
Expand All @@ -136,7 +137,7 @@ def test_doesnt_crash_with_broken_path(self):
@patch('os.walk', autospec=True)
def test_doesnt_crash_when_no_tech_notes_exist(self, mock_walk):
mock_walk.return_value = self.__mock_file_system(
randint(1, 10), add_notes=False)
randint(1, 10), add_pdf_notes=False)

found_file = utils.find_tech_notes(self.path)

Expand All @@ -163,20 +164,55 @@ def test_returns_tech_notes_when_multiple_pdfs_present(self, mock_walk):
def test_first_file_returned_when_multiple_pdfs_but_no_tech_notes(
self, mock_walk):
mock_walk.return_value = self.__mock_file_system(
randint(1, 10), add_notes=False, add_pdf=True)
randint(1, 10), add_pdf_notes=False, add_pdf=True)

found_file = utils.find_tech_notes(self.path)

assert os.path.basename(found_file) == self.other_pdf1

def __mock_file_system(self, depth, add_notes=True, add_pdf=False):
@patch('os.walk', autospec=True)
def test_finds_non_pdf_tech_notes(self, mock_walk):
mock_walk.return_value = self.__mock_file_system(
randint(1, 10), add_pdf_notes=False, add_pdf=True,
add_jpg_notes=True)

found_file = utils.find_tech_notes(self.path)

assert os.path.basename(found_file) == self.jpg_notes

@patch('os.walk', autospec=True)
def test_doesnt_pick_similarly_named_file(self, mock_walk):
mock_walk.return_value = self.__mock_file_system(
randint(1, 10), add_pdf_notes=False, add_pdf=True,
add_jpg_notes=True, add_jpgs=True)

found_file = utils.find_tech_notes(self.path)

assert os.path.basename(found_file) == self.jpg_notes

@patch('os.walk', autospec=True)
def test_prefers_pdf_notes_over_other_formats(self, mock_walk):
mock_walk.return_value = self.__mock_file_system(
randint(1, 10), add_pdf_notes=True, add_jpg_notes=True,
add_pdf=True, add_jpgs=True)

found_file = utils.find_tech_notes(self.path)

assert os.path.basename(found_file) == self.notes

def __mock_file_system(self, depth, add_pdf_notes=True, add_jpgs=False,
add_jpg_notes=False, add_pdf=False):
walk_list = []
cur_path = self.path
file_list = ["file1.txt", "file2"]
if add_pdf:
file_list.extend([self.other_pdf1, self.other_pdf2])
if add_notes:
if add_jpg_notes:
file_list.extend([self.jpg_notes])
if add_pdf_notes:
file_list.append(self.notes)
if add_jpgs:
file_list.extend(['SpiralView.jpg', 'RANotes.jpg'])
for num in range(1, depth + 1):
cur_path = cur_path + "/dir{}".format(num)
dirs = ("dir{}".format(num + 1), )
Expand Down
Loading