Updated CLI for Cambridge #4

Open · wants to merge 31 commits into main

Changes shown from 23 of 31 commits.

Commits
7ab11d6 Create cambridge_fetchfiles.py (jdm010, Aug 4, 2023)
f3ee332 Create functions.py (jdm010, Aug 10, 2023)
019a958 Create get_test_data.py (jdm010, Aug 10, 2023)
156459e Create cambridge_cli.py (jdm010, Aug 10, 2023)
c21dc98 Delete cambridge_fetchfiles.py (jdm010, Aug 10, 2023)
e8c0114 Merge pull request #1 from jdm010/jdm010-patch-1 (jdm010, Aug 16, 2023)
d972ee3 Create functions.py (jdm010, Aug 16, 2023)
cca5082 Add files via upload (jdm010, Aug 16, 2023)
649196d Merge pull request #2 from jdm010/jdm010-patch-1 (jdm010, Aug 16, 2023)
617be2c Create README.md (jdm010, Aug 16, 2023)
fd36c76 Create README.md (jdm010, Aug 16, 2023)
4b26b35 Delete src/providers/springer directory (jdm010, Aug 29, 2023)
4e9e590 Update and rename functions.py to utils.py (jdm010, Aug 29, 2023)
9bd378a Delete get_test_data.py (jdm010, Aug 29, 2023)
93b93ac Update and rename cambridge_cli.py to cli.py (jdm010, Aug 29, 2023)
27d8520 Update README.md (jdm010, Aug 29, 2023)
1867fb4 Create test_utils.py (jdm010, Aug 30, 2023)
3580917 Update test_utils.py (jdm010, Aug 31, 2023)
57be339 Update utils.py (jdm010, Aug 31, 2023)
4b1d16a Update cli.py (jdm010, Aug 31, 2023)
22b9559 Create utils.py (jdm010, Aug 31, 2023)
313b9dd Create cli.py (jdm010, Aug 31, 2023)
070d31f Create test_utils.py (jdm010, Aug 31, 2023)
a5374cc Create utils.py (jdm010, Sep 5, 2023)
7231777 Create cli.py (jdm010, Sep 5, 2023)
330c7e9 Update utils.py (jdm010, Sep 6, 2023)
13ce132 Update test_utils.py (jdm010, Sep 6, 2023)
2e85656 Update utils.py (jdm010, Sep 6, 2023)
979f7c1 Update test_utils.py (jdm010, Sep 6, 2023)
db4f301 Create cli.py (jdm010, Sep 6, 2023)
121fd15 Create README.md (jdm010, Sep 6, 2023)
3 changes: 3 additions & 0 deletions src/providers/cambridge/README.md
@@ -0,0 +1,3 @@
CLI for Cambridge University Press.

The CLI finds the new releases for a specified subject and stores them as a .tsv file in a specified directory.
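A hypothetical invocation (the module path and option values are assumed from the code in this PR):

    python cli.py --subject "computer science" --directory ./downloads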
28 changes: 28 additions & 0 deletions src/providers/cambridge/cli.py
@@ -0,0 +1,28 @@
import click
from bs4 import BeautifulSoup
from utils import (
    get_page_content,
    find_links_and_tags,
    download_file
)

@click.command()
@click.option('--subject', prompt='Enter the subject', type=click.Choice(['computer science', 'engineering', 'mathematics', 'physics', 'science and engineering', 'statistics']))
@click.option('--directory', prompt='Enter the directory', type=click.Path(exists=True, file_okay=False, dir_okay=True))
def main(subject, directory):
    url = 'https://www.cambridge.org/core/services/librarians/kbart'
    response = get_page_content(url)

    if response:
        soup = BeautifulSoup(response.text, 'html.parser')
        prefix = 'cambridge ebooks and partner presses: 2023 '
        subjects = [subject]
        found_links = find_links_and_tags(soup, subjects, prefix)
        for link, subject in zip(found_links, subjects):
            full_link = 'https://www.cambridge.org' + link
            download_file(full_link, subject, directory)

if __name__ == "__main__":
    main()
32 changes: 32 additions & 0 deletions src/providers/cambridge/test_utils.py
@@ -0,0 +1,32 @@
import pytest
from utils import get_page_content, find_links_and_tags, download_file
from bs4 import BeautifulSoup
import os

@pytest.mark.vcr()
def test_page_content_valid():
    response = get_page_content('https://www.cambridge.org/core/services/librarians/kbart')
    assert response.status_code == 200

@pytest.mark.vcr()
def test_page_content_invalid():
    response = get_page_content('https://www.cambridge.org/core/services/librarians/kbartWontWork')
    assert response.status_code == 404

@pytest.mark.vcr
def test_found_links():
    url = 'https://www.cambridge.org/core/services/librarians/kbart'
    response = get_page_content(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    found_links = find_links_and_tags(soup, ['computer science'], 'cambridge ebooks and partner presses: 2023 ')
    assert not len(found_links) == 0
ErnestaP (Aug 31, 2023):

Please assert how many links are actually found, instead of `not len(...) == 0`; an exact count makes the test more precise.

Also, can you add a test where 0 links are found? You can simulate that response by reading it from a file: copy the XML into a file, remove the tags you are looking for, and use that file as the test input. Or find an XML that doesn't contain a matching tag and use that one.
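A minimal sketch of the suggested zero-links test, assuming a saved page is read from a local fixture (the fixture path here is hypothetical):

    def test_found_links_empty():
        # Hypothetical fixture: a saved KBART page with the matching anchor tags removed
        with open('tests/data/kbart_no_matches.html') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        found_links = find_links_and_tags(soup, ['computer science'], 'cambridge ebooks and partner presses: 2023 ')
        assert len(found_links) == 0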


@pytest.mark.vcr
def test_download_file(tmp_path):
    url = 'https://www.cambridge.org/core/services/aop-cambridge-core/kbart/create/bespoke/717854B1C18FD5D0B882344E83E6F52B'
    desired_filename = 'computer science'
    target_filepath = str(tmp_path) + '/'
Review comment:

Are you sure you need the + "/"? Since you use os.path.join in line 30, it's not needed: os.path.join inserts a separator between the parts it joins.

With your solution, expected_filepath ends up with two slashes after the value of target_filepath.
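A minimal sketch of the suggested change, assuming download_file is also switched to os.path.join (as suggested in the comments on utils.py below):

    target_filepath = str(tmp_path)  # no trailing slash; os.path.join adds separators as needed
    download_file(url, desired_filename, target_filepath)
    expected_filepath = os.path.join(target_filepath, f"{desired_filename}.tsv")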

    download_file(url, desired_filename, target_filepath)
    expected_filepath = os.path.join(target_filepath, f"{desired_filename}.tsv")
    assert os.path.exists(expected_filepath)
    assert os.path.getsize(expected_filepath) > 0
36 changes: 36 additions & 0 deletions src/providers/cambridge/utils.py
@@ -0,0 +1,36 @@
import requests
import structlog
logger = structlog.get_logger()

Review comment:

Maybe separate the imports and the variable with a new line? It improves readability.


def get_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an HTTPError if the response status code indicates an error

Review comment:

I don't think we need the comment here; raise_for_status() is quite self-explanatory.

        return response
    except requests.exceptions.ConnectionError as err:
        logger.error(err)
        return response
    except requests.exceptions.HTTPError as err:
        logger.error(err)
        return response

def find_links_and_tags(soup, subjects, prefix):
    found_links = []
    for subject in subjects:
        target_word = prefix + subject
        for tag in soup.find_all(string=lambda text: text and target_word in text.lower()):
            parent_tag = tag.parent

Review comment:

What happens if the tag doesn't have a parent, say the parent tag is the root? Does it return None or crash?

Author (jdm010):

So should we have something like

            else:
                return None

or

            else:
                continue

Review comment:

To check, please write a simple function and see what happens when you:

  1. return None
  2. use else: continue

What is the difference?

For my question, there are a few ways to check: reading the docs or trying it manually. Please run a small function that parses a tiny HTML document and look up a parent that doesn't exist:

soup = BeautifulSoup("<p>Some</p>", "html.parser")
for tag in soup.find_all('p'):
    tag.parent

Review comment:

@jdm010 ⬆️
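For reference, a minimal check of the question above, based on my reading of bs4's behavior (worth verifying locally): a string matched by find_all(string=...) always has a parent, at worst the BeautifulSoup object itself, whose .name is '[document]'; only the root's own .parent is None.

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>Some</p>", "html.parser")
    tag = soup.find('p')
    print(tag.parent.name)  # '[document]': the BeautifulSoup root object
    print(soup.parent)      # None: only the root itself has no parent

So in find_links_and_tags, parent_tag should never be None for a matched string, though it can be the document root rather than an <a> tag.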

            if parent_tag.name == 'a' and parent_tag.get('href'):
                link = parent_tag.get('href')
                found_links.append(link)
    return found_links

def download_file(url, desired_filename, target_filepath):
    response = requests.get(url)
    if response.status_code == 200:
        filename = f"{desired_filename}.tsv"
        with open(target_filepath + filename, 'wb') as file:

Review comment:

You can use os.path.join to construct the file path, instead of using +.

            file.write(response.content)
        print(f'Successfully downloaded {filename}')
    else:
        print(f"Error: Failed to download {desired_filename} ({response.status_code})")
28 changes: 28 additions & 0 deletions src/providers/springer/cli.py
@@ -0,0 +1,28 @@
from bs4 import BeautifulSoup
import click

from utils import (
    get_page_content,
    find_links_and_tags,
    download_file
)

@click.command()
@click.option('--subject', prompt='Enter the subject', type=click.Choice(['chemistry and materials science', 'computer science', 'engineering', 'mathematics and statistics', 'physics and astronomy']))
@click.option('--directory', prompt='Enter the directory', type=click.Path(exists=True, file_okay=False, dir_okay=True))
def main(subject, directory):
    url = 'https://adminportal.springernature.com/metadata/kbart'
    response = get_page_content(url)

    if response:
        soup = BeautifulSoup(response.text, 'html.parser')
        prefix = 'springer '
        suffix = ' ebooks 2023'
        subjects = [subject]
        found_links = find_links_and_tags(soup, subjects, prefix, suffix)
        for link, subject in zip(found_links, subjects):
            full_link = 'https://adminportal.springernature.com' + link
            download_file(full_link, subject, directory)

if __name__ == "__main__":
    main()
36 changes: 36 additions & 0 deletions src/providers/springer/test_utils.py
@@ -0,0 +1,36 @@
import pytest
from utils import get_page_content, find_links_and_tags, download_file
from bs4 import BeautifulSoup
import os

@pytest.mark.vcr()
def test_page_content_valid():
    response = get_page_content('https://adminportal.springernature.com/metadata/kbart')
    assert response.status_code == 200

@pytest.mark.vcr()
def test_page_content_invalid():
    response = get_page_content('https://adminportal.springernature.com/metadata/errkbart')
    assert response.status_code == 404

@pytest.mark.vcr
def test_found_links():
    url = 'https://adminportal.springernature.com/metadata/kbart'
    response = get_page_content(url)
    if response:
        soup = BeautifulSoup(response.text, 'html.parser')
        subjects = ['engineering']
        found_links = find_links_and_tags(soup, subjects, 'springer ', ' ebooks 2023')
        assert len(found_links) == 1

@pytest.mark.vcr
def test_download_file(tmp_path):
    url = 'https://adminportal.springernature.com/metadata/kbart/Springer_Global_Springer_Computer_Science_eBooks_2023_English+International_2023-08-01.txt'
    desired_filename = 'computer science'
    target_filepath = str(tmp_path) + '/'

Review comment:

You don't need to add the slash if you use os.path.join in download_file.


    download_file(url, desired_filename, target_filepath)

    expected_filepath = os.path.join(target_filepath, f"{desired_filename}.tsv")
    assert os.path.exists(expected_filepath)
    assert os.path.getsize(expected_filepath) > 0
36 changes: 36 additions & 0 deletions src/providers/springer/utils.py
@@ -0,0 +1,36 @@
import requests
import structlog
logger = structlog.get_logger()

def get_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an HTTPError if the response status code indicates an error

Review comment:

Same as above: I don't think we need the comment; raise_for_status() is quite self-explanatory.

        return response
    except requests.exceptions.ConnectionError as err:
        logger.error(err)
        return response
    except requests.exceptions.HTTPError as err:
        logger.error(err)
        return response

def find_links_and_tags(soup, subjects, prefix, suffix):
    found_links = []
    for subject in subjects:
        target_word = prefix + subject + suffix
        for tag in soup.find_all(string=lambda text: text and target_word in text.lower()):
            parent_tag = tag.parent
            if parent_tag.name == 'a' and parent_tag.get('href'):
                link = parent_tag.get('href')
                found_links.append(link)
    return found_links

def download_file(url, desired_filename, target_filepath):
    response = requests.get(url)
    if response.status_code == 200:
        filename = f"{desired_filename}.tsv"
        with open(target_filepath + filename, 'wb') as file:

Review comment:

Can you use os.path.join here?

            file.write(response.content)
        print(f'Successfully downloaded {filename}')

Review comment:

Instead of all the prints, please use the logger.

    else:
        print(f"Error: Failed to download {desired_filename} ({response.status_code})")