diff --git a/README.md b/README.md index f8da12dd9..f000dbf22 100644 --- a/README.md +++ b/README.md @@ -102,10 +102,22 @@ Don't forget to checkout the [examples] folder for detailed demonstration! ### Connector -Connector provides a simple way to collect data from different websites, offering several benefits: +Connector provides a simple programming interface to collect structured data from different Web APIs (e.g., Twitter API, Yelp Fusion API, Spotify API, DBLP API), making web data collection easy and efficient, without requiring advanced programming skills. + + +Do you want to leverage the growing number of websites that are opening their data through public APIs? + + +Connector is for you! + +Let's check out the benefits that Connector offers: + * A unified API: you can fetch data using one or two lines of code to get data from many websites. + * Auto Pagination: it automatically does the pagination for you so that you can specify the desired count of the returned results without even considering the count-per-request restriction from the API. + * Smart API request strategy: it can issue API requests in parallel while respecting the rate limit policy. + In the following examples, you can download the Yelp business search result into a pandas DataFrame, using only two lines of code, without taking deep looking into the Yelp documentation! 
# Patch metadata carried over from the original (whitespace-collapsed) diff text:
#   diff --git a/assets/connector_concurrency.gif b/assets/connector_concurrency.gif
#     new file mode 100644, index 000000000..e99cfc948
#     Binary files /dev/null and b/assets/connector_concurrency.gif differ
#   diff --git a/assets/connector_main.gif b/assets/connector_main.gif
#     new file mode 100644, index 000000000..1d40bf0a1
#     Binary files /dev/null and b/assets/connector_main.gif differ
#   diff --git a/assets/connector_pagination.gif b/assets/connector_pagination.gif
#     new file mode 100644, index 000000000..6f8a685c1
#     Binary files /dev/null and b/assets/connector_pagination.gif differ
#   diff --git a/dataprep/connector/config_generator.py b/dataprep/connector/config_generator.py
#     new file mode 100644, index 000000000..8e3020eb8
"""
    This module implements the generation of connector configuration files
"""
import re
import json
from urllib import parse
from requests import Request, Response, Session


def create_config(example: str) -> "ConfigGenerator":
    """
    Creates a ConfigGenerator object which has an in-memory
    representation of a configuration file

    Parameters
    ----------
    example
        The HTTP request example, e.g.:
        GET https://openlibrary.org/api/books?bibkeys=ISBN:0385472579&format=json

    Returns
    -------
    ConfigGenerator
        The ConfigGenerator instance.

    Raises
    ------
    RuntimeError
        Propagated from ConfigGenerator.add_example when the example cannot
        be parsed or the request does not yield a JSON body with status 200.
    """
    config_gen = ConfigGenerator()
    config_gen.add_example(example)
    return config_gen


# pylint: disable=too-many-instance-attributes
class ConfigGenerator:
    """
    Class that generate configuration files according to
    input information provided by the user, for example
    an HTTP request example from a REST API.

    Example
    -------
    >>> from dataprep.connector import config_generator as cg
    >>> req_example = "GET https://openlibrary.org/api/books?bibkeys=ISBN:0385472579&format=json"
    >>> config = cg.create_config(req_example)

    """

    _request_example: str
    _url: str
    _parameters: dict
    _content_type: str
    _table_path: str
    _version: int
    _request: dict
    _response: dict
    _method: str
    _schema_cols: list
    _headers: dict
    _orient: str
    _session: Session
    _config: str

    def __init__(self) -> None:
        self._request_example = ""
        self._url = ""
        self._parameters = {}
        self._content_type = ""
        self._table_path = "$[*]"
        self._version = 1
        self._request = {}
        self._response = {}
        self._method = "GET"
        self._schema_cols = []
        self._headers = {}
        self._orient = "records"
        self._session = Session()
        self._config = ""

    def add_example(self, request_example: str) -> "ConfigGenerator":
        """
        Parse the request example, execute the request, create the in-memory
        representation of a configuration file and returns the corresponding
        ConfigGenerator object.

        Parameters
        ----------
        request_example
            The HTTP request example, e.g.:
            GET https://openlibrary.org/api/books?bibkeys=ISBN:0385472579&format=json

        Returns
        -------
        ConfigGenerator
            The ConfigGenerator instance created from the request example.

        Raises
        ------
        RuntimeError
            If the example holds no URL with query parameters, the HTTP
            status is not 200, or the response body is not valid JSON.
        """
        self._parse_example(request_example)
        self._execute_request()
        self._create_config_file_representation()
        return self

    def _parse_example(self, request_example: str) -> None:
        """
        Parse the request example extracting all the relevant information to perform
        a request.

        The first http(s) URL found in the example is split into the base URL
        (stored without its query string) and its query parameters.

        Parameters
        ----------
        request_example
            The HTTP request example, for example:
            GET https://openlibrary.org/api/books?bibkeys=ISBN:0385472579&format=json

        Raises
        ------
        RuntimeError
            If no http(s) URL is found, or the URL carries no query parameters.
        """
        self._request_example = request_example

        # The named group must be spelled (?P<url>...); without the "<url>"
        # part the pattern itself is invalid and no example can ever match.
        match = re.search(r"(?P<url>https?://\S+)", self._request_example)
        if match is None:
            raise RuntimeError(
                f"Malformed request example syntax: {self._request_example}"
            )

        parsed_full_url = parse.urlparse(match.group("url"))
        self._parameters = parse.parse_qs(parsed_full_url.query)
        if not self._parameters:
            # An example without query parameters is rejected as malformed.
            raise RuntimeError(
                f"Malformed request example syntax: {self._request_example}"
            )

        # Keep everything except the query string, which is sent as params.
        self._url = parse.urlunparse(parsed_full_url._replace(query=""))

    def _execute_request(self) -> None:
        """
        Execute an HTTP request taking as input all the parameters extracted from
        the request example, then, extract all the relevant information from the
        received HTTP response.

        Raises
        ------
        RuntimeError
            If the HTTP status is not 200 or the body is not valid JSON.
        """
        request = Request(
            method=self._method,
            url=self._url,
            headers=self._headers,
            params=self._parameters,
            json=None,
            data=None,
            cookies={},
        )
        prep_request = request.prepare()
        # NOTE(review): no timeout is passed, so this call can block for as
        # long as the server keeps the connection open.
        resp: Response = self._session.send(prep_request)

        if resp.status_code != 200:
            raise RuntimeError(
                f"HTTP status received: {resp.status_code}. Expected: 200."
            )

        # A response without a Content-Type header must not abort generation.
        self._content_type = resp.headers.get("content-type", "")
        try:
            self._response = resp.json()
        except ValueError:
            raise RuntimeError(
                f"Response body from {self._url} does not contain a valid JSON."
            ) from None

    @staticmethod
    def _infer_schema_cols(response) -> list:
        """
        Derive the schema column names from the first record of a decoded
        JSON response.

        Parameters
        ----------
        response
            The decoded JSON body. For a mapping the first value is taken as
            the sample record; for a list the first element is.

        Returns
        -------
        list
            Keys of the sample record, or an empty list when the response is
            empty or its sample record is not mapping-like.
        """
        if not response:
            return []
        if isinstance(response, dict):
            sample = next(iter(response.values()))
        elif isinstance(response, list):
            sample = response[0]
        else:
            return []
        try:
            return list(dict(sample))
        except (TypeError, ValueError):
            # The sample record is a scalar or otherwise not mapping-like.
            return []

    def _create_config_file_representation(self) -> None:
        """
        Creates an in-memory representation (string) of a configuration file.

        The schema columns are inferred from the stored response, every
        request parameter is listed with the value False, and the result is
        serialized as indented JSON into the instance.
        """
        self._schema_cols = self._infer_schema_cols(self._response)
        config = {
            "version": self._version,
            "request": {
                "url": self._url,
                "method": self._method,
                "params": {p: False for p in self._parameters},
            },
            "response": {
                "ctype": "application/json",
                "tablePath": self._table_path,
                "schema": {
                    sc: {"target": "$." + sc, "type": "string"}
                    for sc in self._schema_cols
                },
                "orient": self._orient,
            },
        }
        self._config = json.dumps(config, indent=4)

    def save(self, filename: str) -> None:
        """
        Save to disk the current in-memory representation (string) of a configuration file to a
        file specified as parameter.

        Parameters
        ----------
        filename
            Name of the file to be saved. It can include the path.
        """
        # Explicit encoding keeps the output independent of the platform default.
        with open(filename, "w", encoding="utf-8") as outfile:
            outfile.write(self._config)

    def to_string(self) -> str:
        """
        Return the current in-memory representation (string) of a configuration file.

        Returns
        -------
        _config
            String of the in-memory representation (string) of a configuration file.
        """
        return self._config