Skip to content
This repository has been archived by the owner on Dec 21, 2023. It is now read-only.

Commit

Permalink
Bring code back to GitHub
Browse files Browse the repository at this point in the history
  • Loading branch information
markanewman committed Jan 18, 2018
1 parent d71981d commit 1b3201a
Show file tree
Hide file tree
Showing 15 changed files with 491 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
* text=auto

*.rmd linguist-language=R

data/* linguist-vendored
19 changes: 19 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Visual Studio
.vs/
*.suo
*.user
*.userosscache
*.userprefs
[Dd]ebug/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Pp]ackages/

# WinMerge
*.bak
38 changes: 38 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Introduction

Pulls down all the SEC filings from [EDGAR][edgar] by filing type for a given time range.

# How to use

1. Open a command prompt
2. Change to the directory where you downloaded the files
3. Run `ScrapeSecEdgar.exe`

# Parameters

`ScrapeSecEdgar.exe` has 4 optional parameters

1. formtype
* The SEC form type. Found in the master.idx 'Form Type' column.
* Defaults to 10-K
* Alias: f
2. start
* The date to start the pull. Found in the master.idx 'Date Filed' column.
* Defaults to 1993/01/01.
* Alias: s
3. end
* The date to end the pull. Found in the master.idx 'Date Filed' column.
* Defaults to today.
* Alias: e
4. path
* The path to save files to.
* Defaults to %Desktop%/EDGAR
* Alias: p

```{shell}
ScrapeSecEdgar.exe -s 2017/08/05
```

---------

[edgar]: https://www.sec.gov/edgar.shtml
30 changes: 30 additions & 0 deletions ScrapeSecEdgar.sln
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.27130.2010
MinimumVisualStudioVersion = 10.0.40219.1
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{86035FFE-4DC9-4814-9D0B-C09FB290B927}"
ProjectSection(SolutionItems) = preProject
README.md = README.md
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ScrapeSecEdgar", "ScrapeSecEdgar\ScrapeSecEdgar.csproj", "{B6B34450-518C-4EBE-92A7-9C9FCE63B643}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{B6B34450-518C-4EBE-92A7-9C9FCE63B643}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{B6B34450-518C-4EBE-92A7-9C9FCE63B643}.Debug|Any CPU.Build.0 = Debug|Any CPU
{B6B34450-518C-4EBE-92A7-9C9FCE63B643}.Release|Any CPU.ActiveCfg = Release|Any CPU
{B6B34450-518C-4EBE-92A7-9C9FCE63B643}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {D6095D00-430F-4CF4-AC35-7B54E1D4CDEE}
EndGlobalSection
EndGlobal
6 changes: 6 additions & 0 deletions ScrapeSecEdgar/App.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.7" />
</startup>
</configuration>
59 changes: 59 additions & 0 deletions ScrapeSecEdgar/Jobs/ScrapeJob.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
using ScrapeSecEdgar.Models;
using ScrapeSecEdgar.Tools;
using System;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Threading.Tasks;

namespace ScrapeSecEdgar.Jobs
{
/// <summary>
/// Downloads every filing that matches the form type and date range of an
/// <see cref="EdgarConfiguration"/>, working through one index file at a time.
/// </summary>
class ScrapeJob
{
    // URL template for a master index archive; the placeholders are presumably
    // year and quarter (TODO confirm). Not referenced anywhere inside this class.
    const string _indexPattern = "https://www.sec.gov/Archives/edgar/full-index/{0}/QTR{1}/master.zip";

    // Assigned once in the constructor and never replaced afterwards.
    readonly EdgarConfiguration _config;

    /// <summary>Creates a job bound to the given settings.</summary>
    /// <param name="config">Form type, date range and save location to use.</param>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    public ScrapeJob(EdgarConfiguration config)
    {
        _config = config ?? throw new ArgumentNullException(nameof(config));
    }

    /// <summary>
    /// For each index URL supplied by <c>UrlHelpers.IndexList()</c>: downloads and unzips the
    /// index, parses it, keeps the filings accepted by <see cref="Keep"/>, and downloads them
    /// in ascending filing-date order. Progress is written to the console.
    /// </summary>
    /// <returns>A task that completes when every matching filing has been downloaded.</returns>
    public async Task ScrapeAsync()
    {
        Console.WriteLine($"Getting all the {_config.FormType} forms for {_config.Start.ToString("yyyy/MM/dd")} to {_config.End.ToString("yyyy/MM/dd")}");

        var urlhelpres = new UrlHelpers(_config);

        // One HttpClient is shared by every download in this run.
        using (var client = new HttpClient())
        {
            foreach (var indexurl in urlhelpres.IndexList())
            {
                Console.WriteLine($"Downloading {indexurl}");
                var indexfile = await client.DownloadAndUnzipIndexAsync(indexurl);

                Console.WriteLine($"Parsing {indexfile}");
                // The extracted index lives in its own temp directory; remove that directory
                // even when parsing throws so a failed run does not leave it behind.
                var filings = await RunThenDeleteDirectoryAsync(
                    Path.GetDirectoryName(indexfile),
                    async () => await new IndexFile(indexfile).GetFilingsAsync());

                Console.WriteLine($"Processing {filings.Count} filings");
                filings = filings
                    .Where(p => Keep(p))
                    .OrderBy(p => p.DateFiled)
                    .ToList();
                Console.WriteLine($"{filings.Count} {_config.FormType}s found. Downloading...");

                foreach (var filing in filings)
                {
                    Console.WriteLine($"{filing.DateFiled.ToString("yyyy/MM/dd")} Downloading {filing.Filename}");
                    await client.DownloadFilingAsync(filing, _config.SaveLocation);
                }
            }
        }
    }

    /// <summary>
    /// Awaits <paramref name="work"/> and then deletes <paramref name="directory"/> recursively,
    /// whether the work completed or threw.
    /// </summary>
    static async Task<T> RunThenDeleteDirectoryAsync<T>(string directory, Func<Task<T>> work)
    {
        try
        {
            return await work();
        }
        finally
        {
            Directory.Delete(directory, true);
        }
    }

    /// <summary>
    /// Returns true when the filing has exactly the configured form type (case-sensitive)
    /// and its filing date lies within the configured start/end range, both ends inclusive.
    /// </summary>
    bool Keep(EdgarFiling filing)
    {
        return _config.FormType == filing.FormType
            && _config.Start <= filing.DateFiled
            && filing.DateFiled <= _config.End;
    }
}
}
36 changes: 36 additions & 0 deletions ScrapeSecEdgar/Models/EdgarConfiguration.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
using Mono.Options;
using System;
using System.IO;

namespace ScrapeSecEdgar.Models
{
/// <summary>
/// Settings for one scrape run, built from the command-line arguments.
/// Every option is optional; anything not supplied falls back to a default.
/// </summary>
class EdgarConfiguration
{
    /// <summary>Form type to pull; "10-K" unless overridden with -f / --formtype.</summary>
    public string FormType { get; }

    /// <summary>First filing date to include; 1993-01-01 unless overridden with -s / --start.</summary>
    public DateTime Start { get; }

    /// <summary>Last filing date to include; tomorrow's UTC date unless overridden with -e / --end.</summary>
    public DateTime End { get; }

    /// <summary>Folder to save into; an "EDGAR" folder on the desktop unless overridden with -p / --path.</summary>
    public string SaveLocation { get; }

    /// <summary>Parses the command-line arguments into a configuration.</summary>
    /// <param name="arrs">Raw command-line arguments, e.g. <c>-s 2017/08/05</c>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="arrs"/> is null.</exception>
    /// <exception cref="FormatException">A supplied start or end value is not a recognisable date.</exception>
    public EdgarConfiguration(string[] arrs)
    {
        if (arrs == null)
        {
            throw new ArgumentNullException(nameof(arrs));
        }

        var type = "10-K";
        string start = null; // null means "not supplied"; the default is applied below
        string end = null;
        var save = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "EDGAR");

        // The help texts state the defaults that are actually applied below.
        var options = new OptionSet {
            { "f|formtype=", "The SEC form type. Found in master.idx 'Form Type'. Defaults to 10-K.", p => type = p },
            { "s|start=", "The date to start the pull. Found in master.idx 'Date Filed'. Defaults to 1993/01/01.", p => start = p },
            { "e|end=", "The date to end the pull. Found in master.idx 'Date Filed'. Defaults to today.", p => end = p },
            { "p|path=", "The path to save files to.", p => save = p }
        };

        // Unrecognised arguments are returned by Parse and deliberately ignored.
        options.Parse(arrs);

        FormType = type;

        // Defaults are built as DateTime values rather than formatted and re-parsed, so they
        // do not depend on the current culture's calendar or date separator.
        // User-supplied values are still parsed with the current culture, as before.
        Start = start == null ? new DateTime(1993, 1, 1) : DateTime.Parse(start);

        // NOTE(review): the default end is one day past the current UTC date, presumably so that
        // filings dated "today" are included regardless of time zone - confirm.
        End = end == null ? DateTime.UtcNow.AddDays(1).Date : DateTime.Parse(end);

        SaveLocation = save;
    }
}
}
14 changes: 14 additions & 0 deletions ScrapeSecEdgar/Models/EdgarFiling.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
using System;

namespace ScrapeSecEdgar.Models
{
/// <summary>
/// One filing entry: who filed, what was filed, when, and where to fetch it.
/// A plain mutable property bag with no behaviour of its own.
/// </summary>
class EdgarFiling
{
    /// <summary>Numeric CIK value (presumably the SEC Central Index Key of the filer - TODO confirm).</summary>
    public int CIK { get; set; }

    /// <summary>Name of the company associated with the filing.</summary>
    public string CompanyName { get; set; }

    /// <summary>Date the filing was filed.</summary>
    public DateTime DateFiled { get; set; }

    /// <summary>Form type of the filing, for example "10-K".</summary>
    public string FormType { get; set; }

    /// <summary>File name of the filing; its extension is reused when the filing is saved locally.</summary>
    public string Filename { get; set; }

    /// <summary>Address the filing is downloaded from.</summary>
    public string Url { get; set; }
}
}
30 changes: 30 additions & 0 deletions ScrapeSecEdgar/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using ScrapeSecEdgar.Jobs;
using ScrapeSecEdgar.Models;
using System;

namespace ScrapeSecEdgar
{
/// <summary>
/// Console entry point: builds the configuration from the command line, runs the scrape,
/// reports any failure, and waits for Enter before exiting.
/// </summary>
class Program
{
    /// <summary>Runs one scrape and blocks until it finishes.</summary>
    /// <param name="args">Command-line arguments, passed straight to <c>EdgarConfiguration</c>.</param>
    static void Main(string[] args)
    {
        try
        {
            var config = new EdgarConfiguration(args);
            var job = new ScrapeJob(config);

            // GetAwaiter().GetResult() rethrows the original exception; Wait() would wrap it
            // in an AggregateException and bury the real cause one level down in the output.
            job.ScrapeAsync().GetAwaiter().GetResult();
        }
        catch (Exception ex)
        {
            // Report failure to the calling shell or script; the exit code is otherwise 0.
            Environment.ExitCode = 1;

            Console.WriteLine(new string('-', 30));
            Console.WriteLine(ex);
        }
        finally
        {
            // Keep the console window open until the user presses Enter.
            Console.WriteLine("Done....");
            Console.ReadLine();
        }
    }
}
}
36 changes: 36 additions & 0 deletions ScrapeSecEdgar/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyTitle("ScrapeSecEdgar")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("ScrapeSecEdgar")]
[assembly: AssemblyCopyright("Copyright © 2017")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]

// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]

// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("b6b34450-518c-4ebe-92a7-9c9fce63b643")]

// Version information for an assembly consists of the following four values:
//
// Major Version
// Minor Version
// Build Number
// Revision
//
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]
61 changes: 61 additions & 0 deletions ScrapeSecEdgar/ScrapeSecEdgar.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{B6B34450-518C-4EBE-92A7-9C9FCE63B643}</ProjectGuid>
<OutputType>Exe</OutputType>
<RootNamespace>ScrapeSecEdgar</RootNamespace>
<AssemblyName>ScrapeSecEdgar</AssemblyName>
<TargetFrameworkVersion>v4.7</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
<OutputPath>bin\$(Configuration)\</OutputPath>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<PlatformTarget>AnyCPU</PlatformTarget>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<DefineConstants>DEBUG;TRACE</DefineConstants>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<Optimize>true</Optimize>
<DefineConstants>TRACE</DefineConstants>
</PropertyGroup>
<ItemGroup>
<Reference Include="Mono.Options, Version=5.0.0.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\packages\Mono.Options.5.3.0.1\lib\net4-client\Mono.Options.dll</HintPath>
</Reference>
<Reference Include="Polly, Version=5.6.1.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\packages\Polly.5.6.1\lib\net45\Polly.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="System.IO.Compression.FileSystem" />
<Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="Microsoft.CSharp" />
<Reference Include="System.Data" />
<Reference Include="System.Net.Http" />
<Reference Include="System.Xml" />
</ItemGroup>
<ItemGroup>
<Compile Include="Models\EdgarConfiguration.cs" />
<Compile Include="Models\EdgarFiling.cs" />
<Compile Include="Program.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Jobs\ScrapeJob.cs" />
<Compile Include="Tools\HttpClientExtensions.cs" />
<Compile Include="Tools\IndexFile.cs" />
<Compile Include="Tools\UrlHelpers.cs" />
</ItemGroup>
<ItemGroup>
<None Include="App.config" />
<None Include="packages.config" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
</Project>
49 changes: 49 additions & 0 deletions ScrapeSecEdgar/Tools/HttpClientExtensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
using Polly;
using ScrapeSecEdgar.Models;
using System;
using System.IO;
using System.IO.Compression;
using System.Net.Http;
using System.Threading.Tasks;

namespace ScrapeSecEdgar.Tools
{
/// <summary>
/// <see cref="HttpClient"/> extensions for fetching index archives and individual filings.
/// Each download is retried up to 5 times on any exception, waiting 2^attempt seconds between tries.
/// </summary>
static class HttpClientExtensions
{
    /// <summary>
    /// Downloads the zip archive at <paramref name="url"/> to a temp file, extracts it into a
    /// new uniquely named directory under the temp path, and deletes the temp file.
    /// </summary>
    /// <param name="client">Client used for the download.</param>
    /// <param name="url">Address of the zipped index.</param>
    /// <returns>
    /// Path of "master.idx" inside the extraction directory. The caller owns that directory
    /// and is responsible for deleting it.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="client"/> is null.</exception>
    public static async Task<string> DownloadAndUnzipIndexAsync(this HttpClient client, string url)
    {
        if (client == null)
        {
            throw new ArgumentNullException(nameof(client));
        }

        var tmpfile = Path.GetTempFileName();
        var unzippath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString("N"));

        try
        {
            await Policy.Handle<Exception>()
                .WaitAndRetryAsync(5, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)))
                .ExecuteAsync(async () =>
                {
                    using (var streamin = await client.GetStreamAsync(url))
                    // FileMode.Create truncates, so bytes written by a failed earlier attempt
                    // cannot survive past the end of a later, successful one.
                    using (var streamout = new FileStream(tmpfile, FileMode.Create, FileAccess.Write))
                    {
                        await streamin.CopyToAsync(streamout);
                    }
                });

            ZipFile.ExtractToDirectory(tmpfile, unzippath);
        }
        finally
        {
            // Remove the temp zip even when the download or the extraction failed.
            File.Delete(tmpfile);
        }

        return Path.Combine(unzippath, "master.idx");
    }

    /// <summary>
    /// Downloads <c>filing.Url</c> into <c>{saveLocation}/{CIK}/{yyyyMMdd}.{FormType}{extension}</c>,
    /// creating the directory if needed. The extension is taken from <c>filing.Filename</c>.
    /// </summary>
    /// <param name="client">Client used for the download.</param>
    /// <param name="filing">Filing to download.</param>
    /// <param name="saveLocation">Root folder under which per-CIK folders are created.</param>
    /// <exception cref="ArgumentNullException"><paramref name="client"/> or <paramref name="filing"/> is null.</exception>
    public static async Task DownloadFilingAsync(this HttpClient client, EdgarFiling filing, string saveLocation)
    {
        if (client == null)
        {
            throw new ArgumentNullException(nameof(client));
        }
        if (filing == null)
        {
            throw new ArgumentNullException(nameof(filing));
        }

        var dir = Path.Combine(saveLocation, filing.CIK.ToString());
        Directory.CreateDirectory(dir);

        // Form types can contain characters that are not valid in a file name (for example the
        // '/' in "10-K/A"); left as-is they would turn the name into a path to a missing folder.
        // NOTE(review): two filings with the same CIK, date, form type and extension map to the
        // same file, and the later download overwrites the earlier - confirm that is acceptable.
        var name = $"{filing.DateFiled.ToString("yyyyMMdd")}.{ToSafeFileNamePart(filing.FormType)}{Path.GetExtension(filing.Filename)}";
        var file = Path.Combine(dir, name);

        await Policy.Handle<Exception>()
            .WaitAndRetryAsync(5, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)))
            .ExecuteAsync(async () =>
            {
                using (var streamin = await client.GetStreamAsync(filing.Url))
                using (var streamout = new FileStream(file, FileMode.Create, FileAccess.Write))
                {
                    await streamin.CopyToAsync(streamout);
                }
            });
    }

    /// <summary>
    /// Replaces every character that is invalid in a file name with '_'.
    /// Null becomes the empty string; values without invalid characters are returned unchanged.
    /// </summary>
    static string ToSafeFileNamePart(string value)
    {
        return string.Join("_", (value ?? string.Empty).Split(Path.GetInvalidFileNameChars()));
    }
}
}
Loading

0 comments on commit 1b3201a

Please sign in to comment.