forked from doolin/patentprocessor
-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathprocess.cfg
executable file
·94 lines (82 loc) · 3.86 KB
/
process.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# This is a sample file that configures the environment for the preprocessing
# steps of parsing, cleaning, and consolidation.
# [process] selects which of the configured steps are run by the current run
# of the preprocessor. Accepts 4 options:
# parse: defines which parsing configuration will be run
# clean: if True, runs the cleaning step on the output of parse
# consolidate: if True, runs the consolidation step on the output of clean
# doctype: can be grant, application, or all, and processing will proceed accordingly.
# Note: make sure that the value for grantregex and/or applicationregex
# is defined if you wish to use a value other than the default for either
[process]
# NOTE(review): presumably the name of one of the parse sections defined in
# this file ([2012parse], [test], [download]) -- confirm against the loader
parse=download
clean=True
consolidate=True
doctype=all
#[defaultparse]
## 'datadir' specifies the path to the directory containing the XML files that
## we want to parse. This path will be evaluated relative to the main directory
## of the preprocessor. Defaults to '/data/patentdata/patents/2013'
#
# datadir=/path/to/patent/data
## 'grantregex' and 'applicationregex' specify the regular expression that
## matches the XML files that we want to parse. If you are downloading data
## from the USPTO, then the default value should be fine. Defaults to
## 'ipg\d{6}.xml', the format found for most USPTO files since 2005
#
# grantregex=ipg\d{6}.xml
# applicationregex=ipa\d{6}.xml
## 'years' specifies the range of years for which you want to download and
## parse. If the current year is specified, the script will download all
## possible files. If 'years' is specified, the 'datadir' option is ignored
## and the relevant files are downloaded to 'downloaddir' (see below).
## Specify years as:
## year1
## year1-year2
## year1,year2,year3
## year1-year2,year3-year4
## latest (downloads the most recent week's data)
## If this option is NOT specified, the parse will run on the contents of 'datadir'
#
# years=2010-2013
## 'downloaddir' specifies the target base directory into which the weekly
## patent files will be downloaded. Note that the preprocessor will create
## directories named for each year inside 'downloaddir', and if they already
## exist, will look inside for previously downloaded files
## If this option is NOT specified, the parse will run on the contents of 'datadir'
#
# downloaddir=/path/to/base/directory/for/downloads
# Example configuration for a parse of 2012 data. Note that the 'grantregex'
# option is not specified because the default value is sufficient.
[2012parse]
# directory containing the XML files to parse
datadir=/data/patentdata/patents/2012
# Example configuration to test the parsing against the XML files under
# test/fixtures/xml.
[test]
# directory containing the XML files to parse
datadir=test/fixtures/xml
# The '.' in front of 'xml' is escaped so that it matches only a literal dot;
# an unescaped '.' is a regex wildcard and would match any character there.
grantregex=\d{4}_\d\.xml
# '.*' deliberately allows arbitrary text between the date digits and the
# '.xml' suffix; only the final dot is escaped.
applicationregex=ipa\d{6}.*\.xml
# Example configuration for a parse of the latest data. Note that the
# regexes for grants and applications will both be used if 'all' is specified
# for doctype in [process]; otherwise only the appropriate one will be used.
[download]
# 'latest' downloads the most recent week's data
years=latest
# base directory for downloads; directories named for each year are created
# inside it
downloaddir=./data
# The leading 'i' is optional, so both 'ipgNNNNNN.xml' and 'pgNNNNNN.xml'
# style names match (likewise 'ipa'/'pa' for applications).
# The '.' in front of 'xml' is escaped so that it matches only a literal dot;
# an unescaped '.' is a regex wildcard and would match any character there.
grantregex=i?pg\d{6}\.xml
applicationregex=i?pa\d{6}\.xml
# This section specifies which grant_handler is to be used for each date of the
# released patent. This section should only have to be touched when a new parser is
# introduced. In the case where a year cannot be parsed from the filename (the
# format `ipgYYMMDD` is assumed), then the default parser is used.
# The dates in the ranges are either YYYY or YYYYMMDD. If only one date is provided,
# then the corresponding handler is assumed for all subsequent patents
[grant-xml-handlers]
# range key: START-END
2005-20130108=lib.handlers.grant_handler_v42
# single-date key: this handler applies from this date onward
20130115=lib.handlers.grant_handler_v44
# NOTE(review): dates after 20130108 and before 20130115 are covered by neither
# key above; presumably they fall back to 'default' -- confirm.
# fallback used when the date cannot be parsed from the filename
default=lib.handlers.grant_handler_v42
# Handlers used for application XML files, selected by date.
# NOTE(review): presumably uses the same key format as [grant-xml-handlers]
# (YYYY or YYYYMMDD, range or single start date) -- confirm.
[application-xml-handlers]
# NOTE(review): this range starts in 2001, but the schema list at the end of
# this section gives v15/v16 before 20050825 -- confirm that the v41 handler
# is intended for those earlier files.
2001-20060822=lib.handlers.application_handler_v41
20060823-20130116=lib.handlers.application_handler_v42
# NOTE(review): the v43 changeover is keyed as 20130117 here, while the schema
# list below gives 20130121 -- confirm which date is correct.
20130117=lib.handlers.application_handler_v43
# fallback handler
default=lib.handlers.application_handler_v42
# schema changes were in 20010131 (v15), 20020101 (v16),
# 20050825 (v41), 20060823 (v42), 20130121 (v43)