From 1b3201a2f6c8b2c4dd8eb856aeba15dbc060246d Mon Sep 17 00:00:00 2001 From: Mark Newman Date: Thu, 18 Jan 2018 17:11:24 -0500 Subject: [PATCH] Bring code back to GitHub --- .gitattributes | 5 ++ .gitignore | 19 ++++++ README.md | 38 +++++++++++ ScrapeSecEdgar.sln | 30 +++++++++ ScrapeSecEdgar/App.config | 6 ++ ScrapeSecEdgar/Jobs/ScrapeJob.cs | 59 +++++++++++++++++ ScrapeSecEdgar/Models/EdgarConfiguration.cs | 36 +++++++++++ ScrapeSecEdgar/Models/EdgarFiling.cs | 14 +++++ ScrapeSecEdgar/Program.cs | 30 +++++++++ ScrapeSecEdgar/Properties/AssemblyInfo.cs | 36 +++++++++++ ScrapeSecEdgar/ScrapeSecEdgar.csproj | 61 ++++++++++++++++++ ScrapeSecEdgar/Tools/HttpClientExtensions.cs | 49 +++++++++++++++ ScrapeSecEdgar/Tools/IndexFile.cs | 66 ++++++++++++++++++++ ScrapeSecEdgar/Tools/UrlHelpers.cs | 37 +++++++++++ ScrapeSecEdgar/packages.config | 5 ++ 15 files changed, 491 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 README.md create mode 100644 ScrapeSecEdgar.sln create mode 100644 ScrapeSecEdgar/App.config create mode 100644 ScrapeSecEdgar/Jobs/ScrapeJob.cs create mode 100644 ScrapeSecEdgar/Models/EdgarConfiguration.cs create mode 100644 ScrapeSecEdgar/Models/EdgarFiling.cs create mode 100644 ScrapeSecEdgar/Program.cs create mode 100644 ScrapeSecEdgar/Properties/AssemblyInfo.cs create mode 100644 ScrapeSecEdgar/ScrapeSecEdgar.csproj create mode 100644 ScrapeSecEdgar/Tools/HttpClientExtensions.cs create mode 100644 ScrapeSecEdgar/Tools/IndexFile.cs create mode 100644 ScrapeSecEdgar/Tools/UrlHelpers.cs create mode 100644 ScrapeSecEdgar/packages.config diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..245676c --- /dev/null +++ b/.gitattributes @@ -0,0 +1,5 @@ +* text=auto + +*.rmd linguist-language=R + +data/* linguist-vendored \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..73fa7de --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +# Visual 
Studio +.vs/ +*.suo +*.user +*.userosscache +*.userprefs +[Dd]ebug/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Pp]ackages/ + +# WinMerge +*.bak \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..63304c2 --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +# Introduction + +Pulls down all the SEC filings from [EDGAR][edgar] by filing type for a given time range. + +# How to use + +1. Open a command prompt +2. Change to the directory where you downloaded the files +3. Run `ScrapeSecEdgar.exe` + +# Parameters + +`ScrapeSecEdgar.exe` has 4 optional parameters + +1. formtype + * The SEC form type. Found in the master.idx 'Form Type' column. + * Defaults to 10-K + * Alias: f +2. start + * The date to start the pull. Found in the master.idx 'Date Filed' column. + * Defaults to 1993/01/01. + * Alias: s +3. end + * The date to end the pull. Found in the master.idx 'Date Filed' column. + * Defaults to today. + * Alias: e +4. path + * The path to save files to. 
+ * Defaults to %Desktop%/EDGAR + * Alias: p + +```{shell} +ScrapeSecEdgar.exe -s 2017/08/05 +``` + +--------- + +[edgar]: https://www.sec.gov/edgar.shtml diff --git a/ScrapeSecEdgar.sln b/ScrapeSecEdgar.sln new file mode 100644 index 0000000..7b2ae75 --- /dev/null +++ b/ScrapeSecEdgar.sln @@ -0,0 +1,30 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.27130.2010 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{86035FFE-4DC9-4814-9D0B-C09FB290B927}" + ProjectSection(SolutionItems) = preProject + README.md = README.md + EndProjectSection +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ScrapeSecEdgar", "ScrapeSecEdgar\ScrapeSecEdgar.csproj", "{B6B34450-518C-4EBE-92A7-9C9FCE63B643}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B6B34450-518C-4EBE-92A7-9C9FCE63B643}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B6B34450-518C-4EBE-92A7-9C9FCE63B643}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B6B34450-518C-4EBE-92A7-9C9FCE63B643}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B6B34450-518C-4EBE-92A7-9C9FCE63B643}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {D6095D00-430F-4CF4-AC35-7B54E1D4CDEE} + EndGlobalSection +EndGlobal diff --git a/ScrapeSecEdgar/App.config b/ScrapeSecEdgar/App.config new file mode 100644 index 0000000..016d28f --- /dev/null +++ b/ScrapeSecEdgar/App.config @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/ScrapeSecEdgar/Jobs/ScrapeJob.cs b/ScrapeSecEdgar/Jobs/ScrapeJob.cs new file mode 100644 index 
using Mono.Options;
using ScrapeSecEdgar.Jobs;
using ScrapeSecEdgar.Models;
using ScrapeSecEdgar.Tools;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Threading.Tasks;

namespace ScrapeSecEdgar.Jobs
{
    /// <summary>
    /// Walks the quarterly EDGAR index files produced by <see cref="UrlHelpers.IndexList"/>,
    /// keeps the filings that match the configuration and downloads each of them.
    /// </summary>
    class ScrapeJob
    {
        readonly EdgarConfiguration _config;

        /// <summary>Creates a job bound to the given configuration.</summary>
        /// <param name="config">Form type, date range and save location to use.</param>
        /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
        public ScrapeJob(EdgarConfiguration config)
        {
            _config = config ?? throw new ArgumentNullException(nameof(config));
        }

        /// <summary>
        /// Downloads every index in the configured range, filters it with <see cref="Keep"/>
        /// and saves the matching filings, oldest first, under the configured save location.
        /// Progress is written to the console.
        /// </summary>
        public async Task ScrapeAsync()
        {
            Console.WriteLine($"Getting all the {_config.FormType} forms for {_config.Start:yyyy/MM/dd} to {_config.End:yyyy/MM/dd}");

            var urlHelpers = new UrlHelpers(_config);

            using (var client = new HttpClient())
            {
                foreach (var indexUrl in urlHelpers.IndexList())
                {
                    Console.WriteLine($"Downloading {indexUrl}");
                    var indexFile = await client.DownloadAndUnzipIndexAsync(indexUrl);

                    List<EdgarFiling> filings;
                    try
                    {
                        Console.WriteLine($"Parsing {indexFile}");
                        filings = await new IndexFile(indexFile).GetFilingsAsync();
                    }
                    finally
                    {
                        // The index was unzipped into its own temporary directory; remove it
                        // even when parsing fails so a failed run does not leave it behind.
                        Directory.Delete(Path.GetDirectoryName(indexFile), true);
                    }

                    Console.WriteLine($"Processing {filings.Count} filings");
                    filings = filings
                        .Where(Keep)
                        .OrderBy(p => p.DateFiled)
                        .ToList();
                    Console.WriteLine($"{filings.Count} {_config.FormType}s found. Downloading...");

                    foreach (var filing in filings)
                    {
                        Console.WriteLine($"{filing.DateFiled:yyyy/MM/dd} Downloading {filing.Filename}");
                        await client.DownloadFilingAsync(filing, _config.SaveLocation);
                    }
                }
            }
        }

        /// <summary>
        /// Returns true when the filing has the configured form type and its filing date lies
        /// within the configured range (both ends inclusive).
        /// </summary>
        bool Keep(EdgarFiling filing)
        {
            return _config.FormType == filing.FormType
                && _config.Start <= filing.DateFiled
                && filing.DateFiled <= _config.End;
        }
    }
}

namespace ScrapeSecEdgar.Models
{
    /// <summary>
    /// Command-line configuration: which form type to pull, for which filing-date range,
    /// and where to save the downloaded files.
    /// </summary>
    class EdgarConfiguration
    {
        const string _defaultFormType = "10-K";

        // The date format the README documents, plus its dash-separated twin.
        static readonly string[] _documentedDateFormats = { "yyyy/MM/dd", "yyyy-MM-dd" };

        static readonly DateTime _defaultStart = new DateTime(1993, 1, 1);

        /// <summary>Form type to keep, compared against the index 'Form Type' column.</summary>
        public string FormType { get; }

        /// <summary>First filing date to include.</summary>
        public DateTime Start { get; }

        /// <summary>Last filing date to include.</summary>
        public DateTime End { get; }

        /// <summary>Root directory the filings are saved under.</summary>
        public string SaveLocation { get; }

        /// <summary>
        /// Parses the command-line arguments (f|formtype, s|start, e|end, p|path) and falls back
        /// to the defaults for anything not supplied: form type 10-K, start 1993/01/01, end
        /// midnight of tomorrow (UTC), and an "EDGAR" folder on the desktop.
        /// </summary>
        /// <param name="arrs">The raw command-line arguments.</param>
        /// <exception cref="FormatException">A supplied date could not be parsed.</exception>
        public EdgarConfiguration(string[] arrs)
        {
            var type = _defaultFormType;
            string start = null;
            string end = null;
            var save = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "EDGAR");

            var options = new OptionSet {
                { "f|formtype=", "The SEC form type. Found in master.idx 'Form Type'. Defaults to 10-K.", p => type = p },
                { "s|start=", "The date to start the pull. Found in master.idx 'Date Filed'. Defaults to 1993/01/01.", p => start = p },
                { "e|end=", "The date to end the pull. Found in master.idx 'Date Filed'. Defaults to today.", p => end = p },
                { "p|path=", "The path to save files to.", p => save = p }
            };

            options.Parse(arrs);

            FormType = type;
            // Defaults are built as dates directly rather than formatted to a string and parsed
            // back, so they do not depend on the current culture.
            Start = start == null ? _defaultStart : ParseDate(start);
            // Midnight of tomorrow (UTC) — the same value the previous string round-trip produced.
            End = end == null ? DateTime.UtcNow.Date.AddDays(1) : ParseDate(end);
            SaveLocation = save;
        }

        /// <summary>
        /// Parses a user-supplied date. The documented yyyy/MM/dd form is read culture-independently;
        /// anything else falls back to the current culture, as before.
        /// </summary>
        static DateTime ParseDate(string text)
        {
            DateTime parsed;
            if (DateTime.TryParseExact(text, _documentedDateFormats, CultureInfo.InvariantCulture, DateTimeStyles.None, out parsed))
            {
                return parsed;
            }

            return DateTime.Parse(text);
        }
    }

    /// <summary>One row of an EDGAR master index plus the absolute URL of the filing.</summary>
    class EdgarFiling
    {
        /// <summary>First index column, parsed as an integer.</summary>
        public int CIK { get; set; }

        /// <summary>Second index column.</summary>
        public string CompanyName { get; set; }

        /// <summary>Fourth index column, parsed as a date.</summary>
        public DateTime DateFiled { get; set; }

        /// <summary>Third index column.</summary>
        public string FormType { get; set; }

        /// <summary>Fifth index column: the path of the filing relative to the index HTTP root.</summary>
        public string Filename { get; set; }

        /// <summary>The index HTTP root concatenated with <see cref="Filename"/>.</summary>
        public string Url { get; set; }
    }
}

namespace ScrapeSecEdgar
{
    /// <summary>Console entry point.</summary>
    class Program
    {
        /// <summary>
        /// Builds the configuration from the arguments, runs the scrape job, prints any failure
        /// and waits for Enter before exiting.
        /// </summary>
        static void Main(string[] args)
        {
            try
            {
                var config = new EdgarConfiguration(args);
                var job = new ScrapeJob(config);

                // GetAwaiter().GetResult() rethrows the original exception, whereas Wait()
                // would bury it inside an AggregateException.
                job.ScrapeAsync().GetAwaiter().GetResult();
            }
            catch (Exception ex)
            {
                Console.WriteLine(new string('-', 30));
                Console.WriteLine(ex);
            }
            finally
            {
                Console.WriteLine("Done....");
                Console.ReadLine();
            }
        }
    }
}
+[assembly: AssemblyTitle("ScrapeSecEdgar")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("")] +[assembly: AssemblyProduct("ScrapeSecEdgar")] +[assembly: AssemblyCopyright("Copyright © 2017")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. +[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("b6b34450-518c-4ebe-92a7-9c9fce63b643")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Build and Revision Numbers +// by using the '*' as shown below: +// [assembly: AssemblyVersion("1.0.*")] +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/ScrapeSecEdgar/ScrapeSecEdgar.csproj b/ScrapeSecEdgar/ScrapeSecEdgar.csproj new file mode 100644 index 0000000..d3438a3 --- /dev/null +++ b/ScrapeSecEdgar/ScrapeSecEdgar.csproj @@ -0,0 +1,61 @@ + + + + + Debug + AnyCPU + {B6B34450-518C-4EBE-92A7-9C9FCE63B643} + Exe + ScrapeSecEdgar + ScrapeSecEdgar + v4.7 + 512 + true + bin\$(Configuration)\ + prompt + 4 + AnyCPU + + + true + full + false + DEBUG;TRACE + + + true + TRACE + + + + ..\packages\Mono.Options.5.3.0.1\lib\net4-client\Mono.Options.dll + + + ..\packages\Polly.5.6.1\lib\net45\Polly.dll + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ScrapeSecEdgar/Tools/HttpClientExtensions.cs b/ScrapeSecEdgar/Tools/HttpClientExtensions.cs new file mode 100644 index 0000000..f652851 --- /dev/null +++ b/ScrapeSecEdgar/Tools/HttpClientExtensions.cs @@ -0,0 +1,49 @@ +using 
using Polly;
using ScrapeSecEdgar.Models;
using System;
using System.IO;
using System.IO.Compression;
using System.Net.Http;
using System.Threading.Tasks;

namespace ScrapeSecEdgar.Tools
{
    /// <summary>
    /// <see cref="HttpClient"/> helpers that download EDGAR index archives and filings,
    /// retrying failed downloads with an exponential back-off.
    /// </summary>
    static class HttpClientExtensions
    {
        const int _retryCount = 5;

        /// <summary>
        /// Downloads the zipped index at <paramref name="url"/>, extracts it into a new
        /// directory under the temp path and returns the path of the extracted master.idx.
        /// The caller owns (and should delete) the returned file's directory.
        /// </summary>
        /// <param name="client">Client used for the download.</param>
        /// <param name="url">URL of the master.zip archive.</param>
        /// <returns>Full path of master.idx inside the extraction directory.</returns>
        public static async Task<string> DownloadAndUnzipIndexAsync(this HttpClient client, string url)
        {
            if (client == null) throw new ArgumentNullException(nameof(client));

            var tmpfile = Path.GetTempFileName();
            var unzippath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString("N"));

            try
            {
                await RetryAsync(() => CopyToFileAsync(client, url, tmpfile));
                ZipFile.ExtractToDirectory(tmpfile, unzippath);
            }
            finally
            {
                // Remove the downloaded archive whether or not download/extraction succeeded.
                File.Delete(tmpfile);
            }

            return Path.Combine(unzippath, "master.idx");
        }

        /// <summary>
        /// Downloads a filing to
        /// <c>{saveLocation}/{CIK}/{DateFiled:yyyyMMdd}.{FormType}{extension of Filename}</c>,
        /// creating the directory when needed and overwriting an existing file.
        /// </summary>
        /// <param name="client">Client used for the download.</param>
        /// <param name="filing">Filing to download; its Url is fetched.</param>
        /// <param name="saveLocation">Root directory for saved filings.</param>
        public static async Task DownloadFilingAsync(this HttpClient client, EdgarFiling filing, string saveLocation)
        {
            if (client == null) throw new ArgumentNullException(nameof(client));
            if (filing == null) throw new ArgumentNullException(nameof(filing));

            var dir = Path.Combine(saveLocation, filing.CIK.ToString());
            Directory.CreateDirectory(dir);

            // The form type is taken verbatim from the index; a value containing a path
            // separator (amendment types such as "10-K/A") would otherwise point into a
            // directory that does not exist and make the write fail.
            var name = ToSafeFileName($"{filing.DateFiled:yyyyMMdd}.{filing.FormType}{Path.GetExtension(filing.Filename)}");
            var file = Path.Combine(dir, name);

            await RetryAsync(() => CopyToFileAsync(client, filing.Url, file));
        }

        /// <summary>Runs <paramref name="download"/>, retrying up to 5 times and waiting 2^attempt seconds between tries.</summary>
        static Task RetryAsync(Func<Task> download)
        {
            // NOTE(review): the exception type originally passed to Policy.Handle<...>() is not
            // visible in the reviewed source. These three are the failures GetStreamAsync and
            // CopyToAsync are expected to raise for HTTP errors, broken streams and timeouts —
            // confirm against the intended policy.
            return Policy
                .Handle<HttpRequestException>()
                .Or<IOException>()
                .Or<TaskCanceledException>()
                .WaitAndRetryAsync(_retryCount, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)))
                .ExecuteAsync(download);
        }

        /// <summary>Streams the response body of <paramref name="url"/> into <paramref name="path"/>.</summary>
        static async Task CopyToFileAsync(HttpClient client, string url, string path)
        {
            using (var streamin = await client.GetStreamAsync(url))
            // FileMode.Create truncates, so a retry never keeps bytes from a failed attempt.
            using (var streamout = new FileStream(path, FileMode.Create, FileAccess.Write))
            {
                await streamin.CopyToAsync(streamout);
            }
        }

        /// <summary>Replaces every character that is not valid in a file name with an underscore.</summary>
        static string ToSafeFileName(string name)
        {
            foreach (var invalid in Path.GetInvalidFileNameChars())
            {
                name = name.Replace(invalid, '_');
            }

            return name;
        }
    }
}
using ScrapeSecEdgar.Models;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Threading.Tasks;

namespace ScrapeSecEdgar.Tools
{
    /// <summary>Reads an extracted EDGAR master.idx file into <see cref="EdgarFiling"/> records.</summary>
    class IndexFile
    {
        const string _httpRootIdentifier = "Cloud HTTP:";
        const string _headerSeperatorIdentifier = "-------";
        const int _columnCount = 5;

        readonly string _idxFile;

        /// <summary>Creates a reader for the given index file.</summary>
        /// <param name="idxFile">Path of the master.idx file.</param>
        /// <exception cref="ArgumentNullException"><paramref name="idxFile"/> is null.</exception>
        public IndexFile(string idxFile)
        {
            _idxFile = idxFile ?? throw new ArgumentNullException(nameof(idxFile));
        }

        /// <summary>
        /// Reads the file in three passes over the same reader: find the "Cloud HTTP:" header
        /// line and take the rest of it as the URL root, skip forward to the dashed separator
        /// line, then parse every remaining non-blank line as a filing.
        /// </summary>
        /// <returns>
        /// The filings in file order. Empty when the "Cloud HTTP:" line or the separator line
        /// is never found, because the reader is then already at end of file.
        /// </returns>
        /// <exception cref="FormatException">A data row is malformed.</exception>
        public async Task<List<EdgarFiling>> GetFilingsAsync()
        {
            var result = new List<EdgarFiling>();
            string httproot = null;

            using (var reader = new StreamReader(_idxFile))
            {
                string line;

                while ((line = await reader.ReadLineAsync()) != null)
                {
                    if (line.StartsWith(_httpRootIdentifier, StringComparison.Ordinal))
                    {
                        // e.g. https://www.sec.gov/Archives/
                        httproot = line.Substring(_httpRootIdentifier.Length).Trim();
                        break;
                    }
                }

                // Skip the rest of the header, up to and including the separator line.
                while ((line = await reader.ReadLineAsync()) != null
                    && !line.StartsWith(_headerSeperatorIdentifier, StringComparison.Ordinal))
                {
                }

                while ((line = await reader.ReadLineAsync()) != null)
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    result.Add(Parse(line, httproot));
                }
            }

            return result;
        }

        /// <summary>
        /// Parses one pipe-delimited row: CIK|Company Name|Form Type|Date Filed|Filename, e.g.
        /// 1000032|BINCH JAMES G|4|2017-12-04|edgar/data/1000032/0000913165-17-000048.txt
        /// </summary>
        /// <exception cref="FormatException">The row has fewer than five columns or a column cannot be parsed.</exception>
        static EdgarFiling Parse(string line, string httproot)
        {
            var seg = line.Split('|');
            if (seg.Length < _columnCount)
            {
                throw new FormatException($"Expected {_columnCount} '|'-separated columns but found {seg.Length}: {line}");
            }

            return new EdgarFiling
            {
                // The index is machine-generated, so numbers and dates are read culture-independently.
                CIK = int.Parse(seg[0], CultureInfo.InvariantCulture),
                CompanyName = seg[1],
                FormType = seg[2],
                DateFiled = DateTime.Parse(seg[3], CultureInfo.InvariantCulture),
                Filename = seg[4],
                Url = httproot + seg[4]
            };
        }
    }

    /// <summary>Builds the URLs of the quarterly EDGAR master index archives for a configuration.</summary>
    class UrlHelpers
    {
        const string _indexPattern = "https://www.sec.gov/Archives/edgar/full-index/{0}/QTR{1}/master.zip";
        readonly EdgarConfiguration _config;

        /// <summary>Creates a helper bound to the given configuration.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
        public UrlHelpers(EdgarConfiguration config)
        {
            _config = config ?? throw new ArgumentNullException(nameof(config));
        }

        /// <summary>
        /// Yields one index URL for every calendar quarter that overlaps the configured
        /// Start..End range, oldest first. Yields nothing when Start is after End.
        /// </summary>
        public IEnumerable<string> IndexList()
        {
            if (_config.Start > _config.End)
            {
                yield break;
            }

            // Step from the first day of Start's quarter. Stepping from Start itself can jump
            // past End while End's quarter is still unvisited (2017/08/05 -> 2018/01/19 would
            // stop after 2017 QTR4 and never request 2018 QTR1).
            var firstMonthOfQuarter = 3 * (MonthToQuarter(_config.Start.Month) - 1) + 1;
            var quarterStart = new DateTime(_config.Start.Year, firstMonthOfQuarter, 1);

            while (quarterStart <= _config.End)
            {
                yield return string.Format(CultureInfo.InvariantCulture, _indexPattern, quarterStart.Year, MonthToQuarter(quarterStart.Month));
                quarterStart = quarterStart.AddMonths(3);
            }
        }

        /// <summary>Maps a month number (1-12) to its calendar quarter (1-4).</summary>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="month"/> is outside 1-12.</exception>
        int MonthToQuarter(int month)
        {
            if (month < 1 || month > 12)
            {
                throw new ArgumentOutOfRangeException(nameof(month));
            }

            return (month - 1) / 3 + 1;
        }
    }
}