extractor.py
import re

import pandas as pd
import numpy as np

term_pat = re.compile(r'(.+?):(.+)')
pat = re.compile(
    r'(.+)_([a-z][a-z]\.)?((?:wikipedia\.org)|(?:commons\.wikimedia\.org)|(?:www\.mediawiki\.org))_([a-z_-]+?)$')

# Debug output to ensure pattern still works
# print(pat.fullmatch('BLEACH_zh.wikipedia.org_all-access_spider').groups())
# print(pat.fullmatch('Accueil_commons.wikimedia.org_all-access_spider').groups())
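# Example of what term_pat captures for a namespace-prefixed title (hypothetical title):
# print(term_pat.match('File:Sunflower.jpg').groups())  # -> ('File', 'Sunflower.jpg')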

def extract(source) -> pd.DataFrame:
    """
    Extracts features from urls. Features: agent, site, country, term, marker
    :param source: urls
    :return: DataFrame, one column per feature
    """
    if isinstance(source, pd.Series):
        source = source.values
    agents = np.full_like(source, np.nan)
    sites = np.full_like(source, np.nan)
    countries = np.full_like(source, np.nan)
    terms = np.full_like(source, np.nan)
    markers = np.full_like(source, np.nan)

    for i in range(len(source)):
        l = source[i]
        match = pat.fullmatch(l)
        assert match, "Non-matched string %s" % l
        term = match.group(1)
        country = match.group(2)
        if country:
            # Strip the trailing dot from the country code, e.g. 'zh.' -> 'zh'
            countries[i] = country[:-1]
        site = match.group(3)
        sites[i] = site
        agents[i] = match.group(4)
        if site != 'wikipedia.org':
            # Non-wikipedia titles may carry a namespace marker, e.g. 'File:...'
            term_match = term_pat.match(term)
            if term_match:
                markers[i] = term_match.group(1)
                term = term_match.group(2)
        terms[i] = term

    return pd.DataFrame({
        'agent': agents,
        'site': sites,
        'country': countries,
        'term': terms,
        'marker': markers,
        'page': source
    })
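
# Minimal usage sketch (assumptions: the Kaggle Web Traffic 'train_1.csv' with a
# 'Page' column is available; the path below is hypothetical).
if __name__ == '__main__':
    pages = pd.read_csv('data/train_1.csv', usecols=['Page'])['Page']
    features = extract(pages)
    print(features.head())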