Implement download tickers to local storage
dgunning committed Oct 9, 2024
1 parent b30b4eb commit 3aa47df
Showing 5 changed files with 97 additions and 38 deletions.
7 changes: 6 additions & 1 deletion edgar/core.py
@@ -81,6 +81,7 @@
'use_local_storage',
'run_async_or_sync',
'download_edgar_data',
'get_edgar_data_directory',
'default_page_size',
'InvalidDateException',
]
@@ -219,6 +220,7 @@ def get_identity() -> str:
return identity


@lru_cache(maxsize=None)
def get_edgar_data_directory() -> Path:
"""Get the edgar data directory"""
default_local_data_dir = Path(os.path.join(os.path.expanduser("~"), ".edgar"))
@@ -235,11 +237,14 @@ def use_local_storage(use_local: bool = True):
os.environ['EDGAR_USE_LOCAL_DATA'] = "1" if use_local else "0"


def download_edgar_data(submissions: bool = True, facts: bool = True):
def download_edgar_data(submissions: bool = True,
facts: bool = True,
reference: bool = True):
"""
Download Edgar data to the local storage directory
:param submissions: Download submissions
:param facts: Download facts
:param reference: Download reference data
"""
if submissions:
from edgar.entities import download_submissions
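
Taken together, the core.py changes add a reference switch to download_edgar_data and memoize the data-directory lookup. A minimal usage sketch of how a caller might drive this, assuming the reference branch mirrors the submissions branch shown above:

from edgar.core import use_local_storage, download_edgar_data, get_edgar_data_directory

use_local_storage()                     # sets EDGAR_USE_LOCAL_DATA=1
print(get_edgar_data_directory())       # resolved once, then served from the lru_cache
download_edgar_data(submissions=False,  # skip the bulk submissions download
                    facts=False,        # skip company facts
                    reference=True)     # fetch only the new reference data
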
28 changes: 22 additions & 6 deletions edgar/httprequests.py
@@ -18,7 +18,7 @@

__all__ = ["get_with_retry", "get_with_retry_async", "stream_with_retry", "post_with_retry", "post_with_retry_async",
"download_file", "download_file_async", "download_json", "download_json_async", "stream_file",
"download_text", "download_text_between_tags", "download_bulk_data"]
"download_text", "download_text_between_tags", "download_bulk_data", "download_datafile"]

attempts = 6
retry_timeout = 40
@@ -105,7 +105,7 @@ def print_metrics(self):
print(f"Peak call rate: {metrics['peak_call_rate']:.2f} calls per second")


_throttler_instances = {} # Singleton instance for throttler
_throttler_instances = {} # Singleton instance for throttler


def throttle_requests(request_rate=None, requests_per_second=None, **kwargs):
@@ -463,7 +463,8 @@ async def download_file_async(url: str, as_text: bool = None, path: Optional[Uni
@retry(on=httpx.RequestError, attempts=attempts, timeout=retry_timeout, wait_initial=wait_initial)
@with_identity
@throttle_requests(requests_per_second=max_requests_per_second)
async def stream_file(url: str, as_text: bool = None, path: Optional[Union[str, Path]] = None, **kwargs) -> Union[str, bytes, None]:
async def stream_file(url: str, as_text: bool = None, path: Optional[Union[str, Path]] = None, **kwargs) -> Union[
str, bytes, None]:
"""
Download a file from a URL asynchronously with progress bar using httpx.
@@ -585,7 +586,7 @@ def download_text_between_tags(url: str, tag: str):
return content


async def download_bulk_data(data_url:str) -> Path:
async def download_bulk_data(data_url: str) -> Path:
"""
Download bulk data e.g. company facts, daily index, etc. from the SEC website
e.g. "https://www.sec.gov/Archives/edgar/daily-index/xbrl/companyfacts.zip"
@@ -599,12 +600,27 @@ async def download_bulk_data(data_url:str) -> Path:
if not download_path.exists():
download_path.mkdir()

# Now stream the file to the data directory
await stream_file(data_url, as_text=False, path=download_path)
as_text = not data_url.endswith('.zip')

if filename.endswith(".zip"):
# Now stream the file to the data directory
await stream_file(data_url, as_text=as_text, path=download_path)
# Unzip the file to the data directory / file
with zipfile.ZipFile(download_filename, 'r') as z:
z.extractall(download_path)
# Delete the zip file
download_filename.unlink()
return download_path


def download_datafile(data_url: str, local_directory: Path = None) -> Path:
"""Download a file to the local storage directory"""
filename = os.path.basename(data_url)
# Create the directory if it doesn't exist
local_directory = local_directory or get_edgar_data_directory()
if not local_directory.exists():
local_directory.mkdir()

download_filename = local_directory / filename
download_file(data_url, path=download_filename)
return download_filename
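
The new download_datafile helper is a synchronous, single-file counterpart to download_bulk_data. A short sketch of calling it directly, using one of the SEC reference URLs wired up in edgar/reference/tickers.py below; the target directory here is illustrative:

from pathlib import Path
from edgar.httprequests import download_datafile

target_dir = Path("/tmp/edgar-reference")  # illustrative path; created if missing
saved = download_datafile("https://www.sec.gov/include/ticker.txt",
                          local_directory=target_dir)
print(saved)  # /tmp/edgar-reference/ticker.txt
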
23 changes: 18 additions & 5 deletions edgar/reference/tickers.py
@@ -3,19 +3,22 @@
from functools import lru_cache
from io import StringIO
from typing import Optional, Union, List

from pathlib import Path
import pandas as pd
import pyarrow as pa
from httpx import HTTPStatusError

from edgar.core import listify
from edgar.httprequests import download_file, download_json
from edgar.core import listify, log
from edgar.httprequests import download_file, download_json, download_datafile
from edgar.reference.data.common import read_parquet_from_package

__all__ = ['cusip_ticker_mapping', 'get_ticker_from_cusip', 'get_company_tickers', 'get_icon_from_ticker', 'find_cik',
'get_cik_tickers', 'get_company_ticker_name_exchange', 'get_companies_by_exchange',
'get_mutual_fund_tickers', 'find_mutual_fund_cik']

ticker_txt_url = "https://www.sec.gov/include/ticker.txt"
company_tickers_json_url = "https://www.sec.gov/files/company_tickers.json"
mutual_fund_tickers_url = "https://www.sec.gov/files/company_tickers_mf.json"
company_tickers_exchange_url = "https://www.sec.gov/files/company_tickers_exchange.json"

@lru_cache(maxsize=1)
def cusip_ticker_mapping(allow_duplicate_cusips: bool = True) -> pd.DataFrame:
@@ -47,7 +50,7 @@ def get_cik_tickers():
data = pd.DataFrame.from_dict(json_data, orient='index')
data = data.rename(columns={'ticker': 'ticker', 'cik_str': 'cik'})
data = data[['ticker', 'cik']]

# Ensure CIK is treated as an integer
data['cik'] = data['cik'].astype(int)

@@ -234,3 +237,13 @@ def get_icon_from_ticker(ticker: str) -> Optional[bytes]:
return None
else:
raise

def download_ticker_data(reference_data_directory: Path):
"""
Download reference data from the SEC website.
"""
log.info(f"Downloading ticker data to {reference_data_directory}")
download_datafile(ticker_txt_url, reference_data_directory)
download_datafile(company_tickers_json_url, reference_data_directory)
download_datafile(mutual_fund_tickers_url, reference_data_directory)
download_datafile(company_tickers_exchange_url, reference_data_directory)
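
download_ticker_data simply fans out to download_datafile for each of the four reference URLs above. A sketch of pointing it at the local edgar data directory; the "reference" subdirectory name is an assumption taken from the commented-out test in tests/test_core.py below:

from edgar.core import get_edgar_data_directory
from edgar.reference.tickers import download_ticker_data

reference_dir = get_edgar_data_directory() / "reference"  # assumed layout
download_ticker_data(reference_dir)
# reference_dir should now hold ticker.txt, company_tickers.json,
# company_tickers_mf.json and company_tickers_exchange.json
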
55 changes: 37 additions & 18 deletions tests/test_core.py
@@ -1,14 +1,16 @@
import datetime
import os
import tempfile
from datetime import datetime
from freezegun import freeze_time

import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pytest
from freezegun import freeze_time
from rich.table import Table

from pathlib import Path
import edgar
from edgar.richtools import *
from edgar.core import (decode_content,
get_identity,
set_identity,
@@ -24,7 +26,9 @@
reverse_name,
get_bool,
is_start_of_quarter,
split_camel_case)
split_camel_case,
download_edgar_data)
from edgar.richtools import *


def test_decode_content():
@@ -175,6 +179,7 @@ def test_filter_by_form():
assert len(filter_by_form(table, form=['10-K', '10-Q', '10-K/A'], amendments=False)) == 3
assert len(filter_by_form(table, form=['10-K', '10-Q', '10-K/A'], amendments=True)) == 4


def test_filter_by_cik():
arrays = [pa.array(['a', 'b', 'c', 'd', 'e']),
pa.array([3, 2, 1, 4, 4]),
@@ -267,6 +272,7 @@ def test_get_bool():
assert get_bool("TRUE")
assert get_bool("True")


def test_split_camel_case():
assert split_camel_case("CoverPage") == "Cover Page"
assert split_camel_case("CONSOLIDATEDBALANCESHEETS") == "CONSOLIDATEDBALANCESHEETS"
@@ -276,18 +282,18 @@ def test_split_camel_case():


@pytest.mark.parametrize("test_date, expected_result", [
("2024-01-01", True), # New Year's Day (start of Q1)
("2024-01-02", True), # First business day after New Year's
("2024-01-01", True), # New Year's Day (start of Q1)
("2024-01-02", True), # First business day after New Year's
("2024-01-03", False), # Second business day after New Year's
("2024-03-31", False), # Last day of Q1
("2024-04-01", True), # First day of Q2
("2024-04-02", True), # Possibly first business day of Q2
("2024-04-01", True), # First day of Q2
("2024-04-02", True), # Possibly first business day of Q2
("2024-04-03", False), # Second business day of Q2
("2024-07-01", True), # First day of Q3
("2024-07-02", True), # Possibly first business day of Q3
("2024-07-01", True), # First day of Q3
("2024-07-02", True), # Possibly first business day of Q3
("2024-07-03", False), # Second business day of Q3
("2024-10-01", True), # First day of Q4
("2024-10-02", True), # Possibly first business day of Q4
("2024-10-01", True), # First day of Q4
("2024-10-02", True), # Possibly first business day of Q4
("2024-10-03", False), # Second business day of Q4
("2024-12-31", False), # Last day of Q4
("2024-05-15", False), # Random day in middle of quarter
@@ -296,14 +302,27 @@ def test_is_start_of_quarter(test_date, expected_result):
with freeze_time(test_date):
assert is_start_of_quarter() == expected_result


@pytest.mark.parametrize("test_datetime, expected_result", [
("2024-01-01 00:00:01", True), # Just after midnight on New Year's
("2024-01-02 23:59:59", True), # Just before midnight on Jan 2
("2024-01-01 00:00:01", True), # Just after midnight on New Year's
("2024-01-02 23:59:59", True), # Just before midnight on Jan 2
("2024-01-03 00:00:01", False), # Just after midnight on Jan 3
("2024-04-01 12:00:00", True), # Noon on first day of Q2
("2024-07-01 18:30:00", True), # Evening on first day of Q3
("2024-10-02 09:00:00", True), # Morning of possibly first business day of Q4
("2024-04-01 12:00:00", True), # Noon on first day of Q2
("2024-07-01 18:30:00", True), # Evening on first day of Q3
("2024-10-02 09:00:00", True), # Morning of possibly first business day of Q4
])
def test_is_start_of_quarter_with_time(test_datetime, expected_result):
with freeze_time(test_datetime):
assert is_start_of_quarter() == expected_result
assert is_start_of_quarter() == expected_result


"""
def test_download_edgar_data(monkeypatch):
with tempfile.TemporaryDirectory() as d:
monkeypatch.setenv("EDGAR_LOCAL_DATA_DIR", d)
assert os.environ["EDGAR_LOCAL_DATA_DIR"] == d
download_edgar_data(submissions=False, facts=False, reference=True)
files = set(f.name for f in (Path(d) / "reference").glob("*"))
assert files & {'ticker.txt', 'company_tickers_exchange.json', 'company_tickers.json',
'company_tickers_mf.json'}
"""
22 changes: 14 additions & 8 deletions tests/test_reference.py
@@ -1,10 +1,13 @@
from edgar.reference import cusip_ticker_mapping, get_ticker_from_cusip, describe_form
from edgar.reference.tickers import get_cik_tickers, find_cik, get_company_ticker_name_exchange, \
get_companies_by_exchange, get_mutual_fund_tickers, find_mutual_fund_cik
import json
from unittest.mock import patch

import pandas as pd

from edgar.reference import cusip_ticker_mapping, get_ticker_from_cusip, describe_form
from edgar.reference.tickers import get_cik_tickers, find_cik, get_company_ticker_name_exchange, \
get_companies_by_exchange, get_mutual_fund_tickers, find_mutual_fund_cik, download_ticker_data


def test_cusip_ticker_mapping():
data = cusip_ticker_mapping()
assert data.loc['15101T102'].Ticker == 'CLXX'
@@ -120,16 +123,19 @@ def test_get_cik_tickers():
"1": {"cik_str": 789019, "ticker": "MSFT", "title": "MICROSOFT CORP"}
})
]

# Clear the lru_cache to ensure we're not getting cached results
get_cik_tickers.cache_clear()

fallback_data = get_cik_tickers()
assert isinstance(fallback_data, pd.DataFrame), "Fallback result should be a pandas DataFrame"
assert set(fallback_data.columns) == {'ticker', 'cik'}, f"Fallback columns should be 'ticker' and 'cik', got {fallback_data.columns}"
assert set(fallback_data.columns) == {'ticker',
'cik'}, f"Fallback columns should be 'ticker' and 'cik', got {fallback_data.columns}"
assert len(fallback_data) == 2, f"Fallback data should have 2 entries, got {len(fallback_data)}"
assert fallback_data['ticker'].tolist() == ['AAPL', 'MSFT'], f"Fallback tickers should be ['AAPL', 'MSFT'], got {fallback_data['ticker'].tolist()}"
assert fallback_data['cik'].tolist() == [320193, 789019], f"Fallback CIKs should be [320193, 789019], got {fallback_data['cik'].tolist()}"
assert fallback_data['ticker'].tolist() == ['AAPL',
'MSFT'], f"Fallback tickers should be ['AAPL', 'MSFT'], got {fallback_data['ticker'].tolist()}"
assert fallback_data['cik'].tolist() == [320193,
789019], f"Fallback CIKs should be [320193, 789019], got {fallback_data['cik'].tolist()}"

# Verify that download_file was called twice
assert mock_download.call_count == 2, f"download_file should be called twice, was called {mock_download.call_count} times"
