diff --git a/.travis.yml b/.travis.yml index ccea69363b..3bca8458fe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,8 +8,8 @@ before_install: - gem install sass --version "=3.5.3" install: - - pip install -r requirements.txt - - pip install '.[test,website]' + - pip install -r requirements-dev.txt + - pip install '.[dev]' - whotracksme --help script: diff --git a/README.md b/README.md deleted file mode 100644 index 8292662529..0000000000 --- a/README.md +++ /dev/null @@ -1,117 +0,0 @@ - - - - -Bringing Transparency to online tracking - built by Cliqz and Ghostery. - -___ - -This repository contains: - -* data on trackers and websites as shown on [whotracks.me](https://whotracks.me) (WTM) -* database mapping tracker domains to companies -* code to render the [whotracks.me](https://whotracks.me) site - - -# Installation - -Python 3.x is needed to build the site. We recommend creating a -[virtualenv](http://docs.python-guide.org/en/latest/dev/virtualenvs/) (or -`pipenv`) to install the dependencies. - -Furthermore, you will need to [install sass](http://sass-lang.com/install). - - -## From Pypi - -```sh -$ pip install whotracksme -``` - - -## From source - -```sh -$ pip install -e . -``` - -That's all you need to get started! - - -# Using the data - -To get started with the data, everything you need can be found in -`whotracksme.data`: - -```python -from whotracksme.data.loader import DataSource - -# available entities -DataSource().trackers -DataSource().companies -DataSource().sites -``` - -For examples of scripts, have a look in the [contrib](./contrib) folder! - - -# Building the site - -Building the site requires a few extra dependencies, not installed by default to -not make the installation heavier than it needs to be. 
You will need to install -`whotracksme` this way: - -```sh -$ pip install 'whotracksme[website]' -``` - -Or if you do it from source: -```sh -$ pip install -e '.[website]' -``` - - -Once this is done, you will have access to a `whotracksme` entry point that can -be used this way: - -```sh -$ whotracksme website [serve] -``` - -The `serve` part is optional and can be used while making changes on the -website. - -All generated artifacts can be found in the `_site/` folder. - - -## Tests - -To run tests, you will need `pytest`, or simply install `whotacksme` with the -`test` extra: - -```sh -$ pip install -e '.[test,website]' -$ pytest -``` - - -# Contributing - -We are happy to take contributions on: -* Guest articles for our blog in the topics of tracking, privacy and security. Feel free to use the data in this repository if you need inspiration. -* Feature requests that are doable using the WTM database. -* Curating our database of tracker profiles. Open an issue if you spot anything odd. - - -# Right to Amend -Please read our [Guideline for 3rd parties](https://github.com/cliqz-oss/whotracks.me/blob/master/RIGHT_TO_AMEND.md) -wanting to suggest corrections to their data. - - -# License - -The content of this project itself is licensed under the [Creative Commons Attribution 4.0 license](https://creativecommons.org/licenses/by/4.0/), and the underlying -source code used to generate and display that content is licensed under the [MIT license](https://github.com/cliqz-oss/whotracks.me/blob/master/LICENSE.md). diff --git a/README.rst b/README.rst new file mode 100644 index 0000000000..735e7d3ecc --- /dev/null +++ b/README.rst @@ -0,0 +1,132 @@ + +.. image:: https://raw.githubusercontent.com/cliqz-oss/whotracks.me/master/static/img/who-tracksme-logo.png + :width: 200px + :alt: whotracks.me + :target: https://whotracks.me + +Bringing Transparency to online tracking - built by Cliqz and Ghostery. 
+
+-----------------------------------------------------------------------
+
+This repository contains:
+
+- data on trackers and websites as shown on `whotracks.me`_ (WTM)
+- database mapping tracker domains to companies
+- code to render the `whotracks.me`_ site
+
+Installation
+============
+
+Python 3.6 is needed to build the site. We recommend creating a
+`virtualenv`_ (or ``pipenv``) to install the dependencies.
+
+Furthermore, you will need to `install sass`_.
+
+From PyPI
+---------
+
+.. code:: sh
+
+   $ pip install whotracksme
+
+From source
+-----------
+
+.. code:: sh
+
+   $ pip install -e .
+
+That’s all you need to get started!
+
+Using the data
+==============
+
+To get started with the data, everything you need can be found in
+``whotracksme.data``:
+
+.. code:: python
+
+   from whotracksme.data.loader import DataSource
+
+   # available entities
+   DataSource().trackers
+   DataSource().companies
+   DataSource().sites
+
+For examples of scripts, have a look in the `contrib`_ folder!
+
+Building the site
+=================
+
+Building the site requires a few extra dependencies, not installed by
+default to not make the installation heavier than it needs to be. You
+will need to install ``whotracksme`` from the repository, because not
+all assets are packaged with ``whotracksme`` released on PyPI:
+
+.. code:: sh
+
+   $ pip install -e '.[dev]'
+
+Once this is done, you will have access to a ``whotracksme`` entry point
+that can be used this way:
+
+.. code:: sh
+
+   $ whotracksme website [serve]
+
+The ``serve`` part is optional and can be used while making changes on
+the website.
+
+All generated artifacts can be found in the ``_site/`` folder.
+
+Tests
+-----
+
+To run tests, you will need ``pytest``, or simply install ``whotracksme``
+with the ``dev`` extra:
+
+.. code:: sh
+
+   $ pip install -e '.[dev]'
+   $ pytest
+
+Publishing a new version
+========================
+
+.. code:: sh
+
+   $ pip install twine
+   $ python setup.py sdist bdist_wheel
+   $ twine upload --username cliqz-oss dist/*
+
+Contributing
+============
+
+We are happy to take contributions on:
+
+- Guest articles for our blog in the topics of tracking, privacy and
+  security. Feel free to use the data in this repository if you need inspiration.
+- Feature requests that are doable using the WTM database.
+- Curating our database of tracker profiles. Open an issue if you spot anything odd.
+
+Right to Amend
+==============
+
+Please read our `Guideline for 3rd parties`_ wanting to suggest
+corrections to their data.
+
+License
+=======
+
+The content of this project itself is licensed under the `Creative
+Commons Attribution 4.0 license`_, and the underlying source code used
+to generate and display that content is licensed under the `MIT
+license`_.
+
+.. _whotracks.me: https://whotracks.me
+.. _virtualenv: http://docs.python-guide.org/en/latest/dev/virtualenvs/
+.. _install sass: http://sass-lang.com/install
+.. _contrib: ./contrib
+.. _Guideline for 3rd parties: https://github.com/cliqz-oss/whotracks.me/blob/master/RIGHT_TO_AMEND.md
+.. _Creative Commons Attribution 4.0 license: https://creativecommons.org/licenses/by/4.0/
+..
_MIT license: https://github.com/cliqz-oss/whotracks.me/blob/master/LICENSE.md + diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000000..4679b5071f --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,43 @@ +aiofiles==0.3.2 +aiohttp==3.1.2 +argh==0.26.2 +async-timeout==2.0.1 +attrs==17.4.0 +certifi==2018.1.18 +chardet==3.0.4 +colour==0.1.5 +decorator==4.2.1 +docopt==0.6.2 +httptools==0.0.11 +idna==2.6 +idna-ssl==1.0.1 +ipython-genutils==0.2.0 +Jinja2==2.10 +jsonschema==2.6.0 +jupyter-core==4.4.0 +Markdown==2.6.11 +MarkupSafe==1.0 +more-itertools==4.1.0 +multidict==4.1.0 +nbformat==4.4.0 +numpy==1.14.2 +pandas==0.22.0 +pathtools==0.1.2 +plotly==2.5.1 +pluggy==0.6.0 +py==1.5.3 +pytest==3.5.0 +python-dateutil==2.7.2 +pytz==2018.4 +PyYAML==3.12 +requests==2.18.4 +sanic==0.7.0 +six==1.11.0 +squarify==0.3.0 +traitlets==4.3.2 +ujson==1.35 +urllib3==1.22 +uvloop==0.9.1 +watchdog==0.8.3 +websockets==4.0.1 +yarl==1.1.1 diff --git a/requirements.txt b/requirements.txt index 2fca13296e..48153ad4c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,35 +1,6 @@ -aiohttp==3.0.4 -aiofiles==0.3.2 -argh==0.26.2 -boto3==1.4.8 -certifi==2017.11.5 -chardet==3.0.4 -colour==0.1.5 -decorator==4.1.2 docopt==0.6.2 -httptools==0.0.9 -idna==2.6 -ipython-genutils==0.2.0 -Jinja2==2.10 -jsonschema==2.6.0 -jupyter-core==4.4.0 -Markdown==2.6.9 -MarkupSafe==1.0 -nbformat==4.4.0 +numpy==1.14.2 pandas==0.22.0 -pathtools==0.1.2 -plotly==2.2.2 -py==1.5.2 -pytest==3.2.5 -pytz==2017.3 -PyYAML==3.12 -requests==2.18.4 -sanic==0.6.0 +python-dateutil==2.7.2 +pytz==2018.4 six==1.11.0 -squarify==0.3.0 -traitlets==4.3.2 -ujson==1.35 -urllib3==1.22 -uvloop==0.8.1 -watchdog==0.8.3 -websockets==4.0.1 diff --git a/setup.py b/setup.py index 55feb62b67..9ca57e8c46 100644 --- a/setup.py +++ b/setup.py @@ -1,16 +1,34 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import pathlib + from setuptools import setup, find_packages -from os import path -HERE = 
path.abspath(path.dirname(__file__)) + +PKGNAME = 'whotracksme' LONG_DESCRIPTION = '' -with open(path.join(HERE, 'README.md')) as readme_file: +with pathlib.Path('README.rst').open() as readme_file: LONG_DESCRIPTION = readme_file.read() + +# List all resources under whotracksme/data/ +assets = [] +DATA_DIR = pathlib.Path('whotracksme/data') +for root, dirs, files in os.walk(DATA_DIR): + assets.extend( + pathlib.Path(root, f).relative_to(DATA_DIR) + for f in files + if f.endswith('.csv') or f.endswith('.sql') + ) + + setup( - name='whotracksme', - version='2018.03', + name=PKGNAME, + version='2018.3', description='Learn about tracking technologies, market structure and data-sharing on the web', long_description=LONG_DESCRIPTION, classifiers=[ @@ -41,33 +59,28 @@ ]), install_requires=[ 'docopt', + 'pandas' ], extras_require={ - 'website': [ + 'dev': [ + 'aiohttp', 'colour', 'jinja2', 'markdown', - 'pandas', 'plotly', + 'pytest', 'sanic', 'squarify', 'watchdog', ], - 'test': [ - 'pytest', - ], }, package_data={ - 'whotracksme': [ - 'data/assets/*/*/*.csv', - 'data/assets/*.sql', - ], + f'{PKGNAME}.data': assets, }, - include_package_data=True, - zip_safe=False, + zip_safe=True, entry_points={ 'console_scripts': [ - 'whotracksme=whotracksme.main:main', + f'{PKGNAME}={PKGNAME}.main:main', ], }, ) diff --git a/templates/base.html b/templates/base.html index 8db7646f58..477c190f86 100644 --- a/templates/base.html +++ b/templates/base.html @@ -26,7 +26,7 @@ - + {% block extra_styling %} {% endblock %} @@ -38,7 +38,7 @@ + @@ -51,4 +51,4 @@ {% endblock %} {% include "components/breadcrumb.html" %} -{% include "components/footer.html" %} \ No newline at end of file +{% include "components/footer.html" %} diff --git a/whotracksme/data/assets/__init__.py b/whotracksme/data/assets/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/whotracksme/data/loader.py b/whotracksme/data/loader.py index 092258e12b..9214623eb3 100644 --- 
a/whotracksme/data/loader.py +++ b/whotracksme/data/loader.py @@ -1,35 +1,51 @@ +from datetime import datetime from urllib.parse import quote_plus -from datetime import datetime, date +import io import sqlite3 -import pkgutil + +import pkg_resources import pandas as pd -import re -from pathlib import Path -def load_asset(name): - return pkgutil.get_data( - 'whotracksme', - f'data/assets/{name}' - ).decode('utf-8') + +def asset_string(name): + return pkg_resources.resource_string( + 'whotracksme.data', + f'assets/{name}').decode('utf-8') + + +def asset_stream(name): + stream = pkg_resources.resource_stream( + 'whotracksme.data', + f'assets/{name}', + ) + in_memory_stream = io.BytesIO(stream.read()) + stream.close() + return in_memory_stream def load_tracker_db(loc=':memory:'): connection = sqlite3.connect(loc) with connection: - connection.executescript(load_asset('trackerdb.sql')) + connection.executescript(asset_string('trackerdb.sql')) return connection -def get_data_dir(): - return Path(__file__).parent / 'assets' +def list_available_months(): + months = [] + for asset in pkg_resources.resource_listdir('whotracksme.data', 'assets'): + try: + datetime.strptime(asset, '%Y-%m') + except ValueError: + pass + else: + months.append(asset) + return months class DataSource: def __init__(self): - self.data_dir = get_data_dir() - month_matcher = re.compile('[0-9]{4}-[0-9]{2}') - self.data_months = sorted([p.parts[-1] for p in self.data_dir.iterdir() if p.is_dir() and month_matcher.fullmatch(p.parts[-1]) is not None]) + self.data_months = sorted(list_available_months()) print('data available for months:', self.data_months) # Add demographics info to trackers and companies @@ -39,24 +55,20 @@ def __init__(self): self.company_info = self.load_company_info(connection) self.sites_trackers = SitesTrackers( - data_dir=self.data_dir, data_months=[max(self.data_months)], tracker_info=self.app_info ) self.trackers = Trackers( - data_dir=self.data_dir, data_months=self.data_months, 
tracker_info=self.app_info, sites=self.sites_trackers ) self.companies = Companies( - data_dir=self.data_dir, data_months=self.data_months, company_info=self.company_info, tracker_info=self.app_info ) self.sites = Sites( - data_dir=self.data_dir, data_months=self.data_months, trackers=self.sites_trackers ) @@ -126,11 +138,11 @@ def load_company_info(self, connection): class PandasDataLoader: - def __init__(self, data_dir, data_months, name, region='global', id_column=None): + def __init__(self, data_months, name, region='global', id_column=None): self.last_month = max(data_months) self.df = pd.concat([ pd.read_csv( - f'{data_dir}/{month}/{region}/{name}.csv', + asset_stream(f'{month}/{region}/{name}.csv'), parse_dates=['month']) for month in data_months ]) @@ -156,8 +168,8 @@ def get_snapshot(self, month=None): class Trackers(PandasDataLoader): - def __init__(self, data_dir, data_months, tracker_info, sites, region='global'): - super().__init__(data_dir, data_months, name='trackers', region=region) + def __init__(self, data_months, tracker_info, sites, region='global'): + super().__init__(data_months, name='trackers', region=region) self.info = tracker_info # rename tracker column as id @@ -302,8 +314,8 @@ def iter_sites(self, id): class Sites(PandasDataLoader): - def __init__(self, data_dir, data_months, trackers, region='global'): - super().__init__(data_dir, data_months, name='sites', region=region) + def __init__(self, data_months, trackers, region='global'): + super().__init__(data_months, name='sites', region=region) self.trackers = trackers self.df['id'] = self.df['site'] # site -> category mapping @@ -380,8 +392,8 @@ def mean_trackers_timeseries(self, id): class SitesTrackers(PandasDataLoader): - def __init__(self, data_dir, data_months, tracker_info, region='global'): - super().__init__(data_dir, data_months, name='sites_trackers', region=region) + def __init__(self, data_months, tracker_info, region='global'): + super().__init__(data_months, 
name='sites_trackers', region=region) self.df['company_id'] = pd.Series( [tracker_info.get(tracker, {}).get('company_id', tracker) @@ -395,8 +407,8 @@ def get_site(self, site): class Companies(PandasDataLoader): - def __init__(self, data_dir, data_months, company_info, tracker_info, region='global'): - super().__init__(data_dir, data_months, name='companies', region=region) + def __init__(self, data_months, company_info, tracker_info, region='global'): + super().__init__(data_months, name='companies', region=region) self.df['id'] = self.df['company'] self.df['name'] = pd.Series([ company_info.get(row.company, tracker_info.get(row.company, {})).get('name', row.company) diff --git a/whotracksme/main.py b/whotracksme/main.py index 808156a45a..0654647840 100644 --- a/whotracksme/main.py +++ b/whotracksme/main.py @@ -17,10 +17,12 @@ """ -import docopt +from pathlib import Path import os import sqlite3 -from pathlib import Path + +import docopt + from whotracksme.website.builder import Builder from whotracksme.website.serve import serve from whotracksme.data.loader import load_tracker_db diff --git a/whotracksme/website/builder.py b/whotracksme/website/builder.py index c47b2ecbb5..077df92801 100644 --- a/whotracksme/website/builder.py +++ b/whotracksme/website/builder.py @@ -68,82 +68,96 @@ def on_blog_folder_change(self): def feed_event(self, event): futures = [] - ################################################################### - # This needs to be first, as other tasks will need to write in # - # the resulting folders. 
# - ################################################################### - - # Depends on folder: 'static/' - if event & STATIC_FOLDER: - create_site_structure(static_path=STATIC_PATH) - print_progress(text='Create _site') - - - ################################################################### - # We then reload data in memory, before generating the site # - ################################################################### - - # Depends on folder: 'data/' - if self.data_source is None or event & DATA_FOLDER: - # class where all data can be accessed from - data_source = DataSource() - print_progress(text='Load data sources') - - # Depends on: 'blog/' - if self.blog_posts is None or event & BLOG_FOLDER: - self.blog_posts = load_blog_posts() - print_progress(text='Load blog posts') - - - ################################################################### - # Once site structure has been created and data is refreshed, we # - # can build all parts of the site in parallel, since there is no # - # dependencies between them. 
# - ################################################################### - - # Depends on: 'templates/', 'data/' - if event & DATA_FOLDER or event & TEMPLATES_FOLDER: - print_progress(text='Generate error pages') - copy_custom_error_pages(data=data_source) - - # Depends on: 'data/', 'templates/' - if event & DATA_FOLDER or event & TEMPLATES_FOLDER: - # Home - build_home(data=data_source) - - # Trackers - build_trackers_list(data=data_source) - build_tracker_pages(data=data_source) - - # Websites - build_website_list(data=data_source) - build_website_pages(data=data_source) - - # Companies - build_company_reach_chart_page(data=data_source) - - # Depends on: 'data/', 'blog/', 'templates/' - if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER: - build_blogpost_list( - data=data_source, - blog_posts=self.blog_posts - ) - - build_blogpost_pages( - data=data_source, - blog_posts=self.blog_posts - ) - - # Depends on: 'data/', 'blog/', 'templates/' - if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER: - generate_sitemap( - data=data_source, - blog_posts=self.blog_posts - ) - - # TODO: uncomment when company profiles are ready - # if args['site'] or args['companies']: - # company_process = Process(target=build_company_pages, args=(data_source,)) - # company_process.start() - - print('Done') + with concurrent.futures.ProcessPoolExecutor(max_workers=9) as executor: + ################################################################### + # This needs to be first, as other tasks will need to write in # + # the resulting folders. 
# + ################################################################### + + # Depends on folder: 'static/' + if event & STATIC_FOLDER: + create_site_structure(static_path=STATIC_PATH) + print_progress(text='Create _site') + + + ################################################################### + # We then reload data in memory, before generating the site # + ################################################################### + + # Depends on folder: 'data/' + if self.data_source is None or event & DATA_FOLDER: + # class where all data can be accessed from + data_source = DataSource() + print_progress(text='Load data sources') + + # Depends on: 'blog/' + if self.blog_posts is None or event & BLOG_FOLDER: + self.blog_posts = load_blog_posts() + print_progress(text='Load blog posts') + + + ################################################################### + # Once site structure has been created and data is refreshed, we # + # can build all parts of the site in parallel, since there is no # + # dependencies between them. 
# + ################################################################### + + # Depends on: 'templates/', 'data/' + if event & DATA_FOLDER or event & TEMPLATES_FOLDER: + print_progress(text='Generate error pages') + copy_custom_error_pages(data=data_source) + + # Depends on: 'data/', 'templates/' + if event & DATA_FOLDER or event & TEMPLATES_FOLDER: + # Home + # build_home(data=data_source) + futures.append(executor.submit(build_home, data=data_source)) + + # Trackers + futures.append(executor.submit(build_trackers_list, data=data_source)) + futures.append(executor.submit(build_tracker_pages, data=data_source)) + + # Websites + futures.append(executor.submit(build_website_list, data=data_source)) + futures.append(executor.submit(build_website_pages, data=data_source)) + + # Companies + futures.append(executor.submit(build_company_reach_chart_page, data=data_source)) + + # Depends on: 'data/', 'blog/', 'templates/' + if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER: + futures.append(executor.submit( + build_blogpost_list, + data=data_source, + blog_posts=self.blog_posts + )) + + futures.append(executor.submit( + build_blogpost_pages, + data=data_source, + blog_posts=self.blog_posts + )) + + # Depends on: 'data/', 'blog/', 'templates/' + if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER: + futures.append(executor.submit( + generate_sitemap, + data=data_source, + blog_posts=self.blog_posts + )) + + # TODO: uncomment when company profiles are ready + # if args['site'] or args['companies']: + # company_process = Process(target=build_company_pages, args=(data_source,)) + # company_process.start() + + # Wait for all jobs to finish + concurrent.futures.wait(futures) + + # Getting the `result` of each promise (although none is expected) + # allows to re-raise exception happening in children processes. If + # we don't do it, exceptions will be silently ignored. + for future in futures: + future.result() + + print('Done')