Skip to content

Commit

Permalink
Merge pull request #1 from cuenca-mx/scrapper-dhl
Browse files Browse the repository at this point in the history
Scrapper dhl
  • Loading branch information
Ricardo authored Jan 16, 2020
2 parents 7cbd202 + 5d69e5e commit 2b87621
Show file tree
Hide file tree
Showing 23 changed files with 7,631 additions and 207 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ venv.bak/

# mypy
.mypy_cache/
/.dmypy.json

*.pdf

# pycharm
.idea/
Expand All @@ -118,3 +121,4 @@ venv.bak/
# others
.DS_Store


14 changes: 14 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,17 @@ jobs:
- export PYTHONPATH=`pwd`
- pytest --vcr-record=none
- coveralls
- stage: PyPI Upload
python: 3.7
dist: xenial
sudo: true
script: echo "Uploading packages to PyPI"
deploy:
provider: pypi
user: cuenca
password:
secure: "I5xOa610g2M0rG6I+lm3V0ROyDdfb9738OnY+Z3IZzUDo4KvTO3aIBZatMw58rW4hNAAvAyjWE0Je0ABybmcBKLf42CbR+XgRnYoKhYECVgpdWuZsIWAKGf1dmbOo/AE+LEjSSaLNuLea7+x+586uECOFB5YYyKnC5QnLj19481SdS27l4UcUOQ8JFNEMKmPhZgAAXfptsC52/pxkoMC9xdk8sVf/WHOavnB4vzcrlLIYsbOdbU2HXaUxR42TUs7k5ETEnwG3pU1Ic9EPHkFVsh/2DJEHN8fCTnEig84xopZwnlnSYI2FqBB6xp8by5D5rpfTrFY6GZ4oUYwH9u8sOVs0SVTCBDQMht2qKuKA367wQcAvHRQ60OlL6UaOKu4G4sBrEEdNMpm9XewdrMeymkJMXP1+3Rlq+qbv0QxXe12NpK53Svvm7laMC1uGZ7Qgxcwk05dTibaHlUfCzrP79m1DZ0aqyN5IgrOXEpmyJzdDa0TX7dG4OqPCLckHKmlhkAMCq2CQALd5DZ2iBKE1rECrdbsPokB9hFEIjSI9lTe4wbyFN4kUUfdDi9dtIsljVIZkDJjM1Ned2uez0jtL83mfwXXcqMdWHDcFwxQXXc+H2gjdS4MbbmiEypDmuGAlBN3XqycwnkKc/LldK0oOD9QsKle3MCS3hb7885LhVc="

on:
tags: true
distributions: sdist bdist_wheel
1 change: 1 addition & 0 deletions dhlmex/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
__all__ = ['__version__', 'Client']


from .client import Client
from .version import __version__
70 changes: 56 additions & 14 deletions dhlmex/client.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import os
from typing import Any, ClassVar, Dict, Optional

from requests import Response, Session
from requests import HTTPError, Response, Session, codes
from requests.exceptions import SSLError

from .resources import Resource
from .exceptions import DhlmexException
from .resources import Guide, PostCode, Resource

# Base URL of the DHL Mexico prepaid portal.
# NOTE(review): presumably prefixed to every endpoint by Client.request,
# which is not visible here - confirm.
API_URL = 'https://prepaid.dhl.com.mx/Prepago'
# Browser-like User-Agent string; Client.__init__ installs it on the session
# headers.
USER_AGENT = (
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
)
# Certificate file assigned to ``session.verify`` by Client.__init__ when the
# DEBUG environment variable is set.
DHL_CERT = 'prepaid-dhl-com-mx.pem'


class Client:
Expand All @@ -19,30 +22,69 @@ class Client:
session: Session

# resources
...
guides: ClassVar = Guide
post_codes: ClassVar = PostCode

def __init__(
    self, username: Optional[str] = None, password: Optional[str] = None,
):
    """Open a session, log in to DHL and register this client on Resource.

    Credentials that are missing or empty are read from the
    ``DHLMEX_USERNAME`` and ``DHLMEX_PASSWORD`` environment variables;
    a missing variable raises ``KeyError``.

    When the ``DEBUG`` environment variable is set to a non-empty value,
    a notice is printed and ``DHL_CERT`` is used for TLS verification.
    """
    if not username:
        username = os.environ['DHLMEX_USERNAME']
    if not password:
        password = os.environ['DHLMEX_PASSWORD']

    session = Session()
    session.headers['User-Agent'] = USER_AGENT
    if os.getenv('DEBUG'):
        print('Client using Charles certificate')
        session.verify = DHL_CERT
    # The session must be in place before logging in, since the login
    # requests go through this client's own get/post helpers.
    self.session = session
    self._login(username, password)

    # Class-level reference shared by every Resource; a client created
    # later replaces it.
    Resource._client = self

def _login(self, username: str, password: str) -> Response:
    """Log in to the DHL portal and return the login response.

    A GET on ``/`` is issued first to initialize the session cookies,
    then the login form is posted.

    Args:
        username: DHL account user name.
        password: DHL account password.

    Returns:
        The response of the login POST.

    Raises:
        DhlmexException: if DHL reports that the session has expired, if
            the TLS handshake fails (``SSLError``), or if DHL reports an
            already existing session for ``username``.
        HTTPError: any other HTTP error raised by the requests.
    """
    endpoint = '/jsp/app/login/login.xhtml'
    data = {
        'AJAXREQUEST': '_viewRoot',
        'j_id6': 'j_id6',
        'j_id6:j_id20': username,
        'j_id6:j_id22': password,
        'javax.faces.ViewState': 'j_id1',
        'j_id6:j_id29': 'j_id6:j_id29',
    }
    try:
        self.get('/')  # Initialize cookies
        resp = self.post(endpoint, data)
    except HTTPError as httpe:
        # ``resp`` is never bound when a request raises, so the page body
        # must be read from the response attached to the exception
        # (the same approach ``_logout`` uses).
        failed = httpe.response
        if failed is not None and 'Su sesión ha caducado' in failed.text:
            raise DhlmexException('Session has expired') from httpe
        raise
    except SSLError as ssl_error:
        # NOTE(review): message text (including its spelling) is kept
        # verbatim because callers may match on it.
        raise DhlmexException(
            'Cient on debug, but Charles not running'
        ) from ssl_error
    # DHL always return 200 although there is an existing session
    if 'Ya existe una sesión' in resp.text:
        raise DhlmexException(
            f'There is an exisiting session on DHL for {username}'
        )
    return resp

def _logout(self) -> Response:
    """Close the DHL session, if one is open, and return the last response.

    The home page is requested first. If it already shows the login
    screen, that response is returned untouched. Otherwise the "close
    session" form data is extracted from the page and posted back.

    An ``HTTPError`` whose body says the session has expired is treated
    as a successful logout and answered with an empty 200 response; any
    other ``HTTPError`` propagates.
    """
    home_endpoint = '/jsp/app/inicio/inicio.xhtml'
    home_page = self.post(home_endpoint, {})
    if 'Login / Admin' in home_page.text:
        return home_page  # No need to logout

    # Form fields required to end the session properly.
    close_form = Resource.get_data(home_page, Resource._actions['close'])
    try:
        return self.post(home_endpoint, close_form)
    except HTTPError as httpe:
        if 'Su sesión ha caducado' not in httpe.response.text:
            raise
        # The session was already gone: report success to the caller.
        already_closed = Response()
        already_closed.status_code = codes.ok
        return already_closed

def get(self, endpoint: str, **kwargs: Any) -> Response:
    """Issue a GET request to ``endpoint`` through ``self.request``.

    An empty dict is passed as the third positional argument
    (presumably the request payload - confirm against ``request``), and
    any extra keyword arguments are forwarded unchanged.
    """
    empty_payload: Dict[str, Any] = {}
    return self.request('get', endpoint, empty_payload, **kwargs)
Expand Down
4 changes: 4 additions & 0 deletions dhlmex/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class DhlmexException(Exception):
    """An error occurred while scraping DHL."""
4 changes: 3 additions & 1 deletion dhlmex/resources/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
__all__ = ['Resource']
__all__ = ['Guide', 'PostCode', 'Resource']

from .base import Resource
from .guides import Guide
from .post_codes import PostCode
56 changes: 54 additions & 2 deletions dhlmex/resources/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,57 @@
from typing import ClassVar
import re
from typing import ClassVar, Dict

from bs4 import BeautifulSoup
from requests import Response

from dhlmex.exceptions import DhlmexException


class Resource:
    """Base class for DHL resources.

    Holds the client shared by all resources, the portal endpoints and the
    menu actions whose form data ``get_data`` knows how to extract.
    """

    # Assigned by Client.__init__ once the client has logged in.
    _client: ClassVar["dhlmex.Client"]  # type: ignore
    # Portal endpoints, keyed by a short name.
    _urls: Dict[str, str] = {
        'login': '/jsp/app/login/login.xhtml',
        'home': '/jsp/app/inicio/inicio.xhtml',
        'guide': '/jsp/app/cliente/impresionClienteSubUsuario.xhtml',
        'capture': '/jsp/app/cliente/capturaDatosImpresionClienteSU.xhtml',
        'print': '/jsp/app/cliente/guiasImpresas.xhtml',
        'pdf': '/generaImpresionPDF',
    }
    # Menu actions: ``text`` is the link label searched for in the page,
    # ``code`` and ``end`` are form values sent back by ``get_data``.
    _actions: Dict[str, Dict[str, str]] = {
        'close': {
            'text': 'Cerrar Sesión',
            'code': 'j_id9:j_id26',
            'end': 'j_id9:j_id30',
        },
        'print': {
            'text': 'Impresión Sub Usuario',
            'code': 'j_id9:j_id14',
            'end': 'j_id9:j_id16',
        },
        'download': {
            'text': 'Guías Impresas',
            'code': 'j_id9:j_id18',
            'end': 'j_id9:j_id10',
        },
    }

    @staticmethod
    def get_data(resp: Response, action: Dict) -> Dict:
        """Build the form payload that triggers ``action`` on a DHL page.

        Args:
            resp: response whose HTML contains the ViewState input and the
                link for the action.
            action: one of the ``_actions`` entries (needs the ``text``,
                ``code`` and ``end`` keys).

        Returns:
            A dict with the form id, the id paired with the action code,
            the page's ViewState value and the action's end marker.

        Raises:
            DhlmexException: if the page is the login screen, or if the
                ViewState input, the action link or the ``j_id`` values in
                its ``onclick`` handler cannot be found.
        """
        if 'Login / Admin' in resp.text:
            raise DhlmexException('Client not logged in')
        soup = BeautifulSoup(resp.text, features='html.parser')

        view_state_input = soup.find('input', id='javax.faces.ViewState')
        if view_state_input is None:
            raise DhlmexException('ViewState input not found in DHL response')
        view_state = view_state_input.attrs['value']

        link = soup.find('a', text=action['text'])
        if link is None:
            raise DhlmexException(
                f"Link {action['text']!r} not found in DHL response"
            )
        # A link without an onclick handler yields no ids and is rejected
        # by the length check below.
        js = link.attrs.get('onclick', '')
        # The quoted arguments of the handler carry the form ids.
        matches = re.findall(r"\'(.+?)\'", js)
        form_ids = [match for match in matches if match.startswith('j_id')]
        if len(form_ids) < 2:
            raise DhlmexException(
                f"Form ids for {action['text']!r} not found in DHL response"
            )
        j_pair_id = form_ids[1].split(',')[0]
        j_id = form_ids[0]

        return {
            j_id: j_id,
            j_pair_id: action['code'],
            'javax.faces.ViewState': view_state,
            action['end']: action['end'],
        }
14 changes: 14 additions & 0 deletions dhlmex/resources/destination.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from dataclasses import dataclass


@dataclass
class Destination:
    """Addressee details of a shipment.

    Field order is part of the constructor signature and must not change.
    """

    company: str
    contact: str
    mail: str
    phone: str
    address1: str
    postal_code: str
    neighborhood: str
    city: str
    state: str
Loading

0 comments on commit 2b87621

Please sign in to comment.