diff --git a/requirements-sphinx.txt b/requirements-sphinx.txt index 995d297e..e57747ad 100644 --- a/requirements-sphinx.txt +++ b/requirements-sphinx.txt @@ -1,7 +1,7 @@ # Should be the same for requirements.txt: chardet>=2.0.1,<=2.3 dnspython3==1.12 -html5lib>=0.999,<1.0 +html5lib>=0.999,<=0.9999999 # lxml>=3.1.0,<=3.5 # except for this because it requires building C libs namedlist>=1.3,<=1.7 psutil>=2.0,<=4.2 diff --git a/requirements.txt b/requirements.txt index e9fbfb52..f4d88342 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Absolutely known to work versions only: chardet>=2.0.1,<=2.3 dnspython3==1.12 -html5lib>=0.999,<1.0 +html5lib>=0.999,<=0.9999999 lxml>=3.1.0,<=3.5 namedlist>=1.3,<=1.7 psutil>=2.0,<=4.2 diff --git a/wpull/document/htmlparse/html5lib_.py b/wpull/document/htmlparse/html5lib_.py index 6f247437..836818db 100644 --- a/wpull/document/htmlparse/html5lib_.py +++ b/wpull/document/htmlparse/html5lib_.py @@ -4,6 +4,7 @@ import io import os.path +from wpull.collections import FrozenDict, EmptyFrozenDict from wpull.document.htmlparse.base import BaseParser from wpull.document.htmlparse.element import Comment, Doctype, Element @@ -44,11 +45,11 @@ def parse(self, file, encoding=None): buffer = None if tail_buffer: - yield Element(tag, dict(), None, tail_buffer.getvalue(), True) + yield Element(tag, EmptyFrozenDict(), None, tail_buffer.getvalue(), True) tail_buffer = None tag = token['name'] - attrib = dict(token['data']) + attrib = FrozenDict(dict(token['data'])) buffer = io.StringIO() if token['name'] == 'script': @@ -66,7 +67,7 @@ def parse(self, file, encoding=None): buffer = None if tail_buffer: - yield Element(tag, dict(), None, tail_buffer.getvalue(), True) + yield Element(tag, EmptyFrozenDict(), None, tail_buffer.getvalue(), True) tail_buffer = None tail_buffer = io.StringIO() @@ -87,7 +88,7 @@ def parse(self, file, encoding=None): buffer = None if tail_buffer: - yield Element(tag, dict(), None, tail_buffer.getvalue(), True) + yield Element(tag, EmptyFrozenDict(), None, tail_buffer.getvalue(), True) tail_buffer = None diff --git a/wpull/document/htmlparse/lxml_.py b/wpull/document/htmlparse/lxml_.py index a37cb818..19999321 100644 --- a/wpull/document/htmlparse/lxml_.py +++ b/wpull/document/htmlparse/lxml_.py @@ -3,6 +3,7 @@ import lxml.html +from wpull.collections import EmptyFrozenDict, FrozenDict from wpull.document.htmlparse.base import BaseParser from wpull.document.htmlparse.element import Element, Comment from wpull.document.xml import XMLDetector @@ -36,7 +37,7 @@ def start(self, tag, attrib): if self.tail_buffer: self.callback(Element( - self.tag, dict(), + self.tag, EmptyFrozenDict(), None, self.tail_buffer.getvalue(), True @@ -44,7 +45,7 @@ def start(self, tag, attrib): self.tail_buffer = None self.tag = tag - self.attrib = attrib + self.attrib = FrozenDict(attrib) self.buffer = io.StringIO() def data(self, data): @@ -65,7 +66,7 @@ def end(self, tag): if self.tail_buffer: self.callback(Element( - self.tag, dict(), + self.tag, EmptyFrozenDict(), None, self.tail_buffer.getvalue(), True @@ -89,7 +90,7 @@ def close(self): if self.tail_buffer: self.callback(Element( - self.tag, dict(), + self.tag, EmptyFrozenDict(), None, self.tail_buffer.getvalue(), True diff --git a/wpull/proxy/server.py b/wpull/proxy/server.py index cc78b77c..7ad48732 100644 --- a/wpull/proxy/server.py +++ b/wpull/proxy/server.py @@ -98,6 +98,7 @@ def __init__(self, http_client: Client, reader: asyncio.StreamReader, writer: as self._http_client = http_client self._reader = self._original_reader = reader self._writer = self._original_writer = writer + self._is_tunnel = False self._is_ssl_tunnel = False self._cert_filename = wpull.util.get_package_filename('proxy/proxy.crt') @@ -129,7 +130,7 @@ def _process_request(self, request: Request): _logger.debug(__('Got request {0}', request)) if request.method == 'CONNECT': - self._reject_request('CONNECT is intentionally not supported') + yield from self._start_connect_tunnel() return if self._is_ssl_tunnel and request.url.startswith('http://'): @@ -202,6 +203,117 @@ def _process_request(self, request: Request): _logger.debug('Response done.') + @asyncio.coroutine + def _start_connect_tunnel(self): + if self._is_tunnel: + self._reject_request('Cannot CONNECT within CONNECT') + return + + self._is_tunnel = True + + original_socket = yield from self._detach_socket_and_start_tunnel() + is_ssl = yield from self._is_client_request_ssl(original_socket) + + if is_ssl: + _logger.debug('Tunneling as SSL') + yield from self._start_ssl_tunnel() + else: + yield from self._rewrap_socket(original_socket) + + @classmethod + @asyncio.coroutine + def _is_client_request_ssl(cls, socket_: socket.socket) -> bool: + while True: + original_timeout = socket_.gettimeout() + socket_.setblocking(False) + + try: + data = socket_.recv(3, socket.MSG_PEEK) + except OSError as error: + if error.errno in (errno.EWOULDBLOCK, errno.EAGAIN): + yield from asyncio.sleep(0.01) + else: + raise + else: + break + finally: + socket_.settimeout(original_timeout) + + _logger.debug('peeked data %s', data) + if all(ord('A') <= char_code <= ord('Z') for char_code in data): + return False + else: + return True + + @asyncio.coroutine + def _start_ssl_tunnel(self): + '''Start SSL protocol on the socket.''' + + self._is_ssl_tunnel = True + ssl_socket = yield from self._start_ssl_handshake() + yield from self._rewrap_socket(ssl_socket) + + @asyncio.coroutine + def _detach_socket_and_start_tunnel(self) -> socket.socket: + socket_ = self._writer.get_extra_info('socket') + + try: + asyncio.get_event_loop().remove_reader(socket_.fileno()) + except ValueError as error: + raise ConnectionAbortedError() from error + + self._writer.write(b'HTTP/1.1 200 Connection established\r\n\r\n') + yield from self._writer.drain() + + try: + asyncio.get_event_loop().remove_writer(socket_.fileno()) + except ValueError as error: + raise ConnectionAbortedError() from error + + return socket_ + + @asyncio.coroutine + def _start_ssl_handshake(self): + socket_ = self._writer.get_extra_info('socket') + + ssl_socket = ssl.wrap_socket( + socket_, server_side=True, + certfile=self._cert_filename, + keyfile=self._key_filename, + do_handshake_on_connect=False + ) + + # FIXME: this isn't how to START TLS + for dummy in range(1200): + try: + ssl_socket.do_handshake() + break + except ssl.SSLError as error: + if error.errno in (ssl.SSL_ERROR_WANT_READ, ssl.SSL_ERROR_WANT_WRITE): + _logger.debug('Do handshake %s', error) + yield from asyncio.sleep(0.05) + else: + raise + else: + _logger.error(_('Unable to handshake.')) + ssl_socket.close() + self._reject_request('Could not start TLS') + raise ConnectionAbortedError('Could not start TLS') + + return ssl_socket + + @asyncio.coroutine + def _rewrap_socket(self, new_socket): + loop = asyncio.get_event_loop() + reader = asyncio.StreamReader(loop=loop) + protocol = asyncio.StreamReaderProtocol(reader, loop=loop) + transport, dummy = yield from loop.create_connection( + lambda: protocol, sock=new_socket) + writer = asyncio.StreamWriter(transport, protocol, reader, loop) + + self._reader = reader + self._writer = writer + @asyncio.coroutine def _read_request_header(self) -> Request: request = Request() diff --git a/wpull/scraper/html.py b/wpull/scraper/html.py index 79312a5b..a1672515 100644 --- a/wpull/scraper/html.py +++ b/wpull/scraper/html.py @@ -20,7 +20,7 @@ _logger = StyleAdapter(logging.getLogger(__name__)) -_BaseLinkInfo = collections.namedtuple( +LinkInfo = collections.namedtuple( 'LinkInfoType', [ 'element', 'tag', 'attrib', 'link', @@ -28,12 +28,7 @@ 'link_type' ] ) - -class LinkInfo(_BaseLinkInfo): - def __hash__(self): - return self.link.__hash__() - -'''Information about a link in a lxml document. Comparable on link only. +'''Information about a link in a lxml document. Attributes: element: An instance of :class:`.document.HTMLReadElement`.