diff --git a/.gitignore b/.gitignore index feae862..851f135 100644 --- a/.gitignore +++ b/.gitignore @@ -1,16 +1,20 @@ .DS_Store + data/* +code/downloader/test/* +code/downloader/logs/* + +**/login.auth **/*.auth + *.pyc *.key noacri.db **/.ipynb_checkpoints/* **/_examples/* **/_misc.ipynb -**/login.auth **/*.auth .vscode/* **/_temp_/* -code/downloader/test/* **/geckodriver.log -node_modules/* \ No newline at end of file +node_modules/* diff --git a/README.md b/README.md index b1f97ca..1d5b2dc 100644 --- a/README.md +++ b/README.md @@ -25,11 +25,23 @@ This is a demo version of PACER with demo credentials that can be used for free. ## 1. Getting Started - Clone this PACER-tools repo - - Install any missing python packages + - Install any missing Python packages (see below) - Make sure you have a recent version of Firefox installed (80.0+) and [GeckoDriver](https://github.com/mozilla/geckodriver) for Firefox -**Download folder** -For this tutorial we are going to put our data into */data/pacer*. The scraper separates out data by district, so it's best to have a subdirectory for each district, named by court abbreviation (e.g. */data/pacer/ilnd* for Northern District of Illinois). When the scraper runs it will build the necessary structure inside of that subdirectory that it needs to download and house the data from Pacer. +### Installing packages + +This project was originally developed for use with the [Anaconda](https://www.anaconda.com) Python distribution. + +Anaconda users can create a new environment using `conda create -n PACER-tools --file requirements.txt`; change *PACER-tools* to any other environment name if you prefer. + +If you are not an Anaconda user, you can install the Python requirements for this project using `pip install -r pip-requirements.txt`; we recommend creating a separate virtual environment for this project. The reference Python used for development is `3.8.5` but this should work with any `3.8.x` and probably with any version of Python 3. 
+ +Anaconda also manages installation of many non-Python dependencies. Non-Anaconda users may have to install these themselves, in ways which may vary from platform to platform. A list of the non-Python dependencies is included, in a series of commented lines, at the end of `pip-requirements.txt`, but they may not all be needed, and the installation procedure varies by platform. If you use PACER-tools without using Anaconda, we welcome help improving the documentation about what needs to be installed, and, to the extent possible, how it should be installed. + + + +### Download folder +For this tutorial we are going to put our data into *data/pacer*. The scraper separates out data by district, so it's best to have a subdirectory for each district, named by court abbreviation (e.g. *data/pacer/ilnd* for the Northern District of Illinois). When the scraper runs, it will build, inside that subdirectory, the directory structure it needs to download and store the data from PACER. Since we are using the PACER demo, we will use the court abbreviation it uses which is *psc* (for PACER Service Centre). The scraper will take an `inpath` argument, to which we will pass */data/pacer/psc*. @@ -61,7 +73,7 @@ The user will be prompted for the following: *Note*: *All of these parameters that the user was prompted for can actually be given as arguments to the script. These are all explained in full in the documentation. 
To avoid the prompting you can instead run:* - python scrapers.py --override-time --query-conf demo.json -m query -c psc -a login/demo.auth -cl 50 data/pacer/psc + python scrapers.py --override-time --query-conf demo.json -m query -c psc -a login/demo.auth -cl 50 ../../data/pacer/psc **Result** diff --git a/pip-requirements.txt b/pip-requirements.txt new file mode 100644 index 0000000..a30adf3 --- /dev/null +++ b/pip-requirements.txt @@ -0,0 +1,217 @@ +alabaster==0.7.12 +applaunchservices==0.2.1 +appnope==0.1.0 +appscript==1.1.0 +argh==0.26.2 +asn1crypto==1.3.0 +astroid==2.3.3 +astropy==4.0 +atomicwrites==1.3.0 +attrs==19.3.0 +autopep8==1.4.4 +babel==2.8.0 +backcall==0.1.0 +beautifulsoup4==4.9.3 +bitarray==1.2.1 +bkcharts==0.2 +bleach==3.1.0 +bokeh==1.4.0 +boto==2.49.0 +bottleneck==1.3.2 +brotlipy==0.7.0 +certifi==2020.6.20 +cffi==1.14.3 +chardet==3.0.4 +click==7.1.2 +cloudpickle==1.3.0 +colorama==0.4.3 +contextlib2==0.6.0.post1 +cryptography==3.2.1 +cycler==0.10.0 +cython==0.29.15 +cytoolz==0.10.1 +dask==2.11.0 +decorator==4.4.1 +defusedxml==0.6.0 +diff-match-patch==20181111 +distributed==2.11.0 +docutils==0.16 +entrypoints==0.3 +et_xmlfile==1.0.1 +fastcache==1.1.0 +filelock==3.0.12 +flake8==3.7.9 +flask==1.1.1 +fsspec==0.6.2 +future==0.18.2 +gevent==1.4.0 +glob2==0.7 +gmpy2==2.0.8 +greenlet==0.4.15 +h5py==2.10.0 +heapdict==1.0.1 +html5lib==1.0.1 +hypothesis==5.5.4 +idna==2.10 +imageio==2.6.1 +imagesize==1.2.0 +importlib_metadata==1.5.0 +intervaltree==3.0.2 +ipykernel==5.1.4 +ipython==7.12.0 +ipython_genutils==0.2.0 +ipywidgets==7.5.1 +isort==4.3.21 +itsdangerous==1.1.0 +jdcal==1.4.1 +jedi==0.14.1 +jinja2==2.11.1 +joblib==0.14.1 +json5==0.9.1 +jsonschema==3.2.0 +jupyter==1.0.0 +jupyter_client==5.3.4 +jupyter_console==6.1.0 +jupyter_core==4.6.1 +jupyterlab==1.2.6 +jupyterlab_server==1.0.6 +keyring==21.1.0 +kiwisolver==1.1.0 +lazy-object-proxy==1.4.3 +llvmlite==0.31.0 +locket==0.2.0 +lxml==4.5.0 +markupsafe==1.1.1 +matplotlib==3.1.3 +mccabe==0.6.1 +mistune==0.8.4 
+mock==4.0.1 +more-itertools==8.2.0 +mpmath==1.1.0 +multipledispatch==0.6.0 +nbconvert==5.6.1 +nbformat==5.0.4 +networkx==2.4 +nltk==3.4.5 +nose==1.3.7 +notebook==6.0.3 +numba==0.48.0 +numexpr==2.7.1 +numpy==1.19.2 +numpydoc==0.9.2 +olefile==0.46 +openpyxl==3.0.3 +packaging==20.1 +pandas==1.1.3 +pandocfilters==1.4.2 +parso==0.5.2 +partd==1.1.0 +path==13.1.0 +path.py==12.4.0 +pathlib2==2.3.5 +pathtools==0.1.2 +patsy==0.5.1 +pep8==1.7.1 +pexpect==4.8.0 +pickleshare==0.7.5 +pillow==7.0.0 +pip==20.2.4 +pkginfo==1.5.0.1 +pluggy==0.13.1 +ply==3.11 +prometheus_client==0.7.1 +prompt_toolkit==3.0.3 +psutil==5.6.7 +ptyprocess==0.6.0 +py==1.8.1 +pycodestyle==2.5.0 +pycosat==0.6.3 +pycparser==2.20 +pycrypto==2.6.1 +pycurl==7.43.0.5 +pydocstyle==4.0.1 +pyflakes==2.1.1 +pygments==2.5.2 +pylint==2.4.4 +pyodbc==4.0.30 +pyopenssl==19.1.0 +pyparsing==2.4.6 +pyrsistent==0.15.7 +pysocks==1.7.1 +pytest==5.3.5 +pytest-arraydiff==0.3 +pytest-astropy==0.8.0 +pytest-astropy-header==0.1.2 +pytest-doctestplus==0.5.0 +pytest-openfiles==0.4.0 +pytest-remotedata==0.3.2 +python-dateutil==2.8.1 +python-jsonrpc-server==0.3.4 +python-language-server==0.31.7 +pytz==2020.1 +pywavelets==1.1.1 +pyyaml==5.3 +pyzmq==18.1.1 +qdarkstyle==2.8 +qtawesome==0.6.1 +qtconsole==4.6.0 +qtpy==1.9.0 +requests==2.25.1 +rope==0.16.0 +rtree==0.9.3 +ruamel_yaml==0.15.87 +scikit-image==0.16.2 +scikit-learn==0.22.1 +scipy==1.4.1 +seaborn==0.10.0 +selenium==3.141.0 +send2trash==1.5.0 +setuptools==50.3.1 +simplegeneric==0.8.1 +simplejson==3.17.0 +singledispatch==3.4.0.3 +six==1.15.0 +snowballstemmer==2.0.0 +sortedcollections==1.1.2 +sortedcontainers==2.1.0 +soupsieve==2.0.1 +sphinx==2.4.0 +sphinxcontrib-applehelp==1.0.1 +sphinxcontrib-devhelp==1.0.1 +sphinxcontrib-htmlhelp==1.0.2 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.2 +sphinxcontrib-serializinghtml==1.1.3 +sphinxcontrib-websupport==1.2.0 +spyder==4.0.1 +spyder-kernels==1.8.1 +sqlalchemy==1.3.13 +statsmodels==0.11.0 +sympy==1.5.1 +tblib==1.6.0 
+terminado==0.8.3 +testpath==0.4.4 +toolz==0.10.0 +tornado==6.0.3 +tqdm==4.51.0 +traitlets==4.3.3 +ujson==1.35 +unicodecsv==0.14.1 +urllib3==1.25.11 +watchdog==0.10.2 +wcwidth==0.1.8 +webencodings==0.5.1 +werkzeug==1.0.0 +wheel==0.35.1 +widgetsnbextension==3.5.1 +wrapt==1.11.2 +wurlitzer==2.0.0 +xlrd==1.2.0 +xlsxwriter==1.2.7 +xlwings==0.17.1 +xlwt==1.3.0 +xmltodict==0.12.0 +yapf==0.28.0 +zict==1.0.0 +zipp==2.2.0 +selenium-requests==1.3