Implement download tickers to local storage
dgunning committed Oct 9, 2024
1 parent b30b4eb commit 3aa47df
Showing 5 changed files with 97 additions and 38 deletions.
7 changes: 6 additions & 1 deletion edgar/core.py
@@ -81,6 +81,7 @@
'use_local_storage',
'run_async_or_sync',
'download_edgar_data',
'get_edgar_data_directory',
'default_page_size',
'InvalidDateException',
]
@@ -219,6 +220,7 @@ def get_identity() -> str:
return identity


@lru_cache(maxsize=None)
def get_edgar_data_directory() -> Path:
"""Get the edgar data directory"""
default_local_data_dir = Path(os.path.join(os.path.expanduser("~"), ".edgar"))
@@ -235,11 +237,14 @@ def use_local_storage(use_local: bool = True):
os.environ['EDGAR_USE_LOCAL_DATA'] = "1" if use_local else "0"


def download_edgar_data(submissions: bool = True, facts: bool = True):
def download_edgar_data(submissions: bool = True,
facts: bool = True,
reference: bool = True):
"""
Download Edgar data to the local storage directory
:param submissions: Download submissions
:param facts: Download facts
:param reference: Download reference data
"""
if submissions:
from edgar.entities import download_submissions
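
Taken together, the core.py changes add a reference switch to download_edgar_data and memoize the data-directory lookup. A minimal usage sketch of how a caller might drive this, assuming the reference branch mirrors the submissions branch shown above:

from edgar.core import use_local_storage, download_edgar_data, get_edgar_data_directory

use_local_storage()                     # sets EDGAR_USE_LOCAL_DATA=1
print(get_edgar_data_directory())       # resolved once, then served from the lru_cache
download_edgar_data(submissions=False,  # skip the bulk submissions download
                    facts=False,        # skip company facts
                    reference=True)     # fetch only the new reference data
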
28 changes: 22 additions & 6 deletions edgar/httprequests.py
@@ -18,7 +18,7 @@

__all__ = ["get_with_retry", "get_with_retry_async", "stream_with_retry", "post_with_retry", "post_with_retry_async",
"download_file", "download_file_async", "download_json", "download_json_async", "stream_file",
"download_text", "download_text_between_tags", "download_bulk_data"]
"download_text", "download_text_between_tags", "download_bulk_data", "download_datafile"]

attempts = 6
retry_timeout = 40
@@ -105,7 +105,7 @@ def print_metrics(self):
print(f"Peak call rate: {metrics['peak_call_rate']:.2f} calls per second")


_throttler_instances = {} # Singleton instance for throttler
_throttler_instances = {} # Singleton instance for throttler


def throttle_requests(request_rate=None, requests_per_second=None, **kwargs):
@@ -463,7 +463,8 @@ async def download_file_async(url: str, as_text: bool = None, path: Optional[Uni
@retry(on=httpx.RequestError, attempts=attempts, timeout=retry_timeout, wait_initial=wait_initial)
@with_identity
@throttle_requests(requests_per_second=max_requests_per_second)
async def stream_file(url: str, as_text: bool = None, path: Optional[Union[str, Path]] = None, **kwargs) -> Union[str, bytes, None]:
async def stream_file(url: str, as_text: bool = None, path: Optional[Union[str, Path]] = None, **kwargs) -> Union[
str, bytes, None]:
"""
Download a file from a URL asynchronously with progress bar using httpx.
@@ -585,7 +586,7 @@ def download_text_between_tags(url: str, tag: str):
return content


async def download_bulk_data(data_url:str) -> Path:
async def download_bulk_data(data_url: str) -> Path:
"""
Download bulk data e.g. company facts, daily index, etc. from the SEC website
e.g. "https://www.sec.gov/Archives/edgar/daily-index/xbrl/companyfacts.zip"
@@ -599,12 +600,27 @@ async def download_bulk_data(data_url:str) -> Path:
if not download_path.exists():
download_path.mkdir()

# Now stream the file to the data directory
await stream_file(data_url, as_text=False, path=download_path)
as_text = not data_url.endswith('.zip')

if filename.endswith(".zip"):
# Now stream the file to the data directory
await stream_file(data_url, as_text=as_text, path=download_path)
# Unzip the file to the data directory / file
with zipfile.ZipFile(download_filename, 'r') as z:
z.extractall(download_path)
# Delete the zip file
download_filename.unlink()
return download_path


def download_datafile(data_url: str, local_directory: Path = None) -> Path:
"""Download a file to the local storage directory"""
filename = os.path.basename(data_url)
# Create the directory if it doesn't exist
local_directory = local_directory or get_edgar_data_directory()
if not local_directory.exists():
local_directory.mkdir()

download_filename = local_directory / filename
download_file(data_url, path=download_filename)
return download_filename
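
The new download_datafile helper is a synchronous, single-file counterpart to download_bulk_data. A short sketch of calling it directly, using one of the SEC reference URLs wired up in edgar/reference/tickers.py below; the target directory here is illustrative:

from pathlib import Path
from edgar.httprequests import download_datafile

target_dir = Path("/tmp/edgar-reference")  # illustrative path; created if missing
saved = download_datafile("https://www.sec.gov/include/ticker.txt",
                          local_directory=target_dir)
print(saved)  # /tmp/edgar-reference/ticker.txt
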
23 changes: 18 additions & 5 deletions edgar/reference/tickers.py
@@ -3,19 +3,22 @@
from functools import lru_cache
from io import StringIO
from typing import Optional, Union, List

from pathlib import Path
import pandas as pd
import pyarrow as pa
from httpx import HTTPStatusError

from edgar.core import listify
from edgar.httprequests import download_file, download_json
from edgar.core import listify, log
from edgar.httprequests import download_file, download_json, download_datafile
from edgar.reference.data.common import read_parquet_from_package

__all__ = ['cusip_ticker_mapping', 'get_ticker_from_cusip', 'get_company_tickers', 'get_icon_from_ticker', 'find_cik',
'get_cik_tickers', 'get_company_ticker_name_exchange', 'get_companies_by_exchange',
'get_mutual_fund_tickers', 'find_mutual_fund_cik']

ticker_txt_url = "https://www.sec.gov/include/ticker.txt"
company_tickers_json_url = "https://www.sec.gov/files/company_tickers.json"
mutual_fund_tickers_url = "https://www.sec.gov/files/company_tickers_mf.json"
company_tickers_exchange_url = "https://www.sec.gov/files/company_tickers_exchange.json"

@lru_cache(maxsize=1)
def cusip_ticker_mapping(allow_duplicate_cusips: bool = True) -> pd.DataFrame:
@@ -47,7 +50,7 @@ def get_cik_tickers():
data = pd.DataFrame.from_dict(json_data, orient='index')
data = data.rename(columns={'ticker': 'ticker', 'cik_str': 'cik'})
data = data[['ticker', 'cik']]

# Ensure CIK is treated as an integer
data['cik'] = data['cik'].astype(int)

@@ -234,3 +237,13 @@ def get_icon_from_ticker(ticker: str) -> Optional[bytes]:
return None
else:
raise

def download_ticker_data(reference_data_directory: Path):
"""
Download reference data from the SEC website.
"""
log.info(f"Downloading ticker data to {reference_data_directory}")
download_datafile(ticker_txt_url, reference_data_directory)
download_datafile(company_tickers_json_url, reference_data_directory)
download_datafile(mutual_fund_tickers_url, reference_data_directory)
download_datafile(company_tickers_exchange_url, reference_data_directory)
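
download_ticker_data simply fans out to download_datafile for each of the four reference URLs above. A sketch of pointing it at the local edgar data directory; the "reference" subdirectory name is an assumption taken from the commented-out test in tests/test_core.py below:

from edgar.core import get_edgar_data_directory
from edgar.reference.tickers import download_ticker_data

reference_dir = get_edgar_data_directory() / "reference"  # assumed layout
download_ticker_data(reference_dir)
# reference_dir should now hold ticker.txt, company_tickers.json,
# company_tickers_mf.json and company_tickers_exchange.json
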
55 changes: 37 additions & 18 deletions tests/test_core.py
@@ -1,14 +1,16 @@
import datetime
import os
import tempfile
from datetime import datetime
from freezegun import freeze_time

import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pytest
from freezegun import freeze_time
from rich.table import Table

from pathlib import Path
import edgar
from edgar.richtools import *
from edgar.core import (decode_content,
get_identity,
set_identity,
@@ -24,7 +26,9 @@
reverse_name,
get_bool,
is_start_of_quarter,
split_camel_case)
split_camel_case,
download_edgar_data)
from edgar.richtools import *


def test_decode_content():
@@ -175,6 +179,7 @@ def test_filter_by_form():
assert len(filter_by_form(table, form=['10-K', '10-Q', '10-K/A'], amendments=False)) == 3
assert len(filter_by_form(table, form=['10-K', '10-Q', '10-K/A'], amendments=True)) == 4


def test_filter_by_cik():
arrays = [pa.array(['a', 'b', 'c', 'd', 'e']),
pa.array([3, 2, 1, 4, 4]),
@@ -267,6 +272,7 @@ def test_get_bool():
assert get_bool("TRUE")
assert get_bool("True")


def test_split_camel_case():
assert split_camel_case("CoverPage") == "Cover Page"
assert split_camel_case("CONSOLIDATEDBALANCESHEETS") == "CONSOLIDATEDBALANCESHEETS"
@@ -276,18 +282,18 @@ def test_split_camel_case():


@pytest.mark.parametrize("test_date, expected_result", [
("2024-01-01", True), # New Year's Day (start of Q1)
("2024-01-02", True), # First business day after New Year's
("2024-01-01", True), # New Year's Day (start of Q1)
("2024-01-02", True), # First business day after New Year's
("2024-01-03", False), # Second business day after New Year's
("2024-03-31", False), # Last day of Q1
("2024-04-01", True), # First day of Q2
("2024-04-02", True), # Possibly first business day of Q2
("2024-04-01", True), # First day of Q2
("2024-04-02", True), # Possibly first business day of Q2
("2024-04-03", False), # Second business day of Q2
("2024-07-01", True), # First day of Q3
("2024-07-02", True), # Possibly first business day of Q3
("2024-07-01", True), # First day of Q3
("2024-07-02", True), # Possibly first business day of Q3
("2024-07-03", False), # Second business day of Q3
("2024-10-01", True), # First day of Q4
("2024-10-02", True), # Possibly first business day of Q4
("2024-10-01", True), # First day of Q4
("2024-10-02", True), # Possibly first business day of Q4
("2024-10-03", False), # Second business day of Q4
("2024-12-31", False), # Last day of Q4
("2024-05-15", False), # Random day in middle of quarter
@@ -296,14 +302,27 @@ def test_is_start_of_quarter(test_date, expected_result):
with freeze_time(test_date):
assert is_start_of_quarter() == expected_result


@pytest.mark.parametrize("test_datetime, expected_result", [
("2024-01-01 00:00:01", True), # Just after midnight on New Year's
("2024-01-02 23:59:59", True), # Just before midnight on Jan 2
("2024-01-01 00:00:01", True), # Just after midnight on New Year's
("2024-01-02 23:59:59", True), # Just before midnight on Jan 2
("2024-01-03 00:00:01", False), # Just after midnight on Jan 3
("2024-04-01 12:00:00", True), # Noon on first day of Q2
("2024-07-01 18:30:00", True), # Evening on first day of Q3
("2024-10-02 09:00:00", True), # Morning of possibly first business day of Q4
("2024-04-01 12:00:00", True), # Noon on first day of Q2
("2024-07-01 18:30:00", True), # Evening on first day of Q3
("2024-10-02 09:00:00", True), # Morning of possibly first business day of Q4
])
def test_is_start_of_quarter_with_time(test_datetime, expected_result):
with freeze_time(test_datetime):
assert is_start_of_quarter() == expected_result
assert is_start_of_quarter() == expected_result


"""
def test_download_edgar_data(monkeypatch):
with tempfile.TemporaryDirectory() as d:
monkeypatch.setenv("EDGAR_LOCAL_DATA_DIR", d)
assert os.environ["EDGAR_LOCAL_DATA_DIR"] == d
download_edgar_data(submissions=False, facts=False, reference=True)
files = set(f.name for f in (Path(d) / "reference").glob("*"))
assert files & {'ticker.txt', 'company_tickers_exchange.json', 'company_tickers.json',
'company_tickers_mf.json'}
"""
22 changes: 14 additions & 8 deletions tests/test_reference.py
@@ -1,10 +1,13 @@
from edgar.reference import cusip_ticker_mapping, get_ticker_from_cusip, describe_form
from edgar.reference.tickers import get_cik_tickers, find_cik, get_company_ticker_name_exchange, \
get_companies_by_exchange, get_mutual_fund_tickers, find_mutual_fund_cik
import json
from unittest.mock import patch

import pandas as pd

from edgar.reference import cusip_ticker_mapping, get_ticker_from_cusip, describe_form
from edgar.reference.tickers import get_cik_tickers, find_cik, get_company_ticker_name_exchange, \
get_companies_by_exchange, get_mutual_fund_tickers, find_mutual_fund_cik, download_ticker_data


def test_cusip_ticker_mapping():
data = cusip_ticker_mapping()
assert data.loc['15101T102'].Ticker == 'CLXX'
@@ -120,16 +123,19 @@ def test_get_cik_tickers():
"1": {"cik_str": 789019, "ticker": "MSFT", "title": "MICROSOFT CORP"}
})
]

# Clear the lru_cache to ensure we're not getting cached results
get_cik_tickers.cache_clear()

fallback_data = get_cik_tickers()
assert isinstance(fallback_data, pd.DataFrame), "Fallback result should be a pandas DataFrame"
assert set(fallback_data.columns) == {'ticker', 'cik'}, f"Fallback columns should be 'ticker' and 'cik', got {fallback_data.columns}"
assert set(fallback_data.columns) == {'ticker',
'cik'}, f"Fallback columns should be 'ticker' and 'cik', got {fallback_data.columns}"
assert len(fallback_data) == 2, f"Fallback data should have 2 entries, got {len(fallback_data)}"
assert fallback_data['ticker'].tolist() == ['AAPL', 'MSFT'], f"Fallback tickers should be ['AAPL', 'MSFT'], got {fallback_data['ticker'].tolist()}"
assert fallback_data['cik'].tolist() == [320193, 789019], f"Fallback CIKs should be [320193, 789019], got {fallback_data['cik'].tolist()}"
assert fallback_data['ticker'].tolist() == ['AAPL',
'MSFT'], f"Fallback tickers should be ['AAPL', 'MSFT'], got {fallback_data['ticker'].tolist()}"
assert fallback_data['cik'].tolist() == [320193,
789019], f"Fallback CIKs should be [320193, 789019], got {fallback_data['cik'].tolist()}"

# Verify that download_file was called twice
assert mock_download.call_count == 2, f"download_file should be called twice, was called {mock_download.call_count} times"
