ArchiveTeam · anarcat · Jan 22, 2018 · Nov 2, 2018 · Nov 2, 2018 · Nov 2, 2018
diff --git a/requirements-sphinx.txt b/requirements-sphinx.txt
@@ -1,7 +1,7 @@
 # Should be the same for requirements.txt:
 chardet>=2.0.1,<=2.3
 dnspython3==1.12
-html5lib>=0.999,<1.0
+html5lib>=0.999,<=0.9999999
 # lxml>=3.1.0,<=3.5 # except for this because it requires building C libs
 namedlist>=1.3,<=1.7
 psutil>=2.0,<=4.2

diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 # Absolutely known to work versions only:
 chardet>=2.0.1,<=2.3
 dnspython3==1.12
-html5lib>=0.999,<1.0
+html5lib>=0.999,<=0.9999999
 lxml>=3.1.0,<=3.5
 namedlist>=1.3,<=1.7
 psutil>=2.0,<=4.2

diff --git a/wpull/document/htmlparse/html5lib_.py b/wpull/document/htmlparse/html5lib_.py
@@ -4,6 +4,7 @@
 import io
 import os.path
 
+from wpull.collections import FrozenDict, EmptyFrozenDict
 from wpull.document.htmlparse.base import BaseParser
 from wpull.document.htmlparse.element import Comment, Doctype, Element
 
@@ -44,11 +45,11 @@ def parse(self, file, encoding=None):
                     buffer = None
 
                 if tail_buffer:
-                    yield Element(tag, dict(), None, tail_buffer.getvalue(), True)
+                    yield Element(tag, EmptyFrozenDict(), None, tail_buffer.getvalue(), True)
                     tail_buffer = None
 
                 tag = token['name']
-                attrib = dict(token['data'])
+                attrib = FrozenDict(dict(token['data']))
                 buffer = io.StringIO()
 
                 if token['name'] == 'script':
@@ -66,7 +67,7 @@ def parse(self, file, encoding=None):
                     buffer = None
 
                 if tail_buffer:
-                    yield Element(tag, dict(), None, tail_buffer.getvalue(), True)
+                    yield Element(tag, EmptyFrozenDict(), None, tail_buffer.getvalue(), True)
                     tail_buffer = None
 
                 tail_buffer = io.StringIO()
@@ -87,7 +88,7 @@ def parse(self, file, encoding=None):
             buffer = None
 
         if tail_buffer:
-            yield Element(tag, dict(), None, tail_buffer.getvalue(), True)
+            yield Element(tag, EmptyFrozenDict(), None, tail_buffer.getvalue(), True)
             tail_buffer = None
 
 

diff --git a/wpull/document/htmlparse/lxml_.py b/wpull/document/htmlparse/lxml_.py
@@ -3,6 +3,7 @@
 
 import lxml.html
 
+from wpull.collections import EmptyFrozenDict, FrozenDict
 from wpull.document.htmlparse.base import BaseParser
 from wpull.document.htmlparse.element import Element, Comment
 from wpull.document.xml import XMLDetector
@@ -36,15 +37,15 @@ def start(self, tag, attrib):
 
         if self.tail_buffer:
             self.callback(Element(
-                self.tag, dict(),
+                self.tag, EmptyFrozenDict(),
                 None,
                 self.tail_buffer.getvalue(),
                 True
             ))
             self.tail_buffer = None
 
         self.tag = tag
-        self.attrib = attrib
+        self.attrib = FrozenDict(attrib)
         self.buffer = io.StringIO()
 
     def data(self, data):
@@ -65,7 +66,7 @@ def end(self, tag):
 
         if self.tail_buffer:
             self.callback(Element(
-                self.tag, dict(),
+                self.tag, EmptyFrozenDict(),
                 None,
                 self.tail_buffer.getvalue(),
                 True
@@ -89,7 +90,7 @@ def close(self):
 
         if self.tail_buffer:
             self.callback(Element(
-                self.tag, dict(),
+                self.tag, EmptyFrozenDict(),
                 None,
                 self.tail_buffer.getvalue(),
                 True

diff --git a/wpull/proxy/server.py b/wpull/proxy/server.py
@@ -98,6 +98,7 @@ def __init__(self, http_client: Client, reader: asyncio.StreamReader, writer: as
         self._http_client = http_client
         self._reader = self._original_reader = reader
         self._writer = self._original_writer = writer
+        self._is_tunnel = False
         self._is_ssl_tunnel = False
 
         self._cert_filename = wpull.util.get_package_filename('proxy/proxy.crt')
@@ -129,7 +130,7 @@ def _process_request(self, request: Request):
         _logger.debug(__('Got request {0}', request))
 
         if request.method == 'CONNECT':
-            self._reject_request('CONNECT is intentionally not supported')
+            yield from self._start_connect_tunnel()
             return
 
         if self._is_ssl_tunnel and request.url.startswith('http://'):
@@ -202,6 +203,117 @@ def _process_request(self, request: Request):
 
         _logger.debug('Response done.')
 
+    @asyncio.coroutine
+    def _start_connect_tunnel(self):
+        '''Handle a CONNECT request by turning this connection into a tunnel.
+
+        A CONNECT received while this session is already a tunnel is
+        rejected. Otherwise the socket is taken off the event loop, the
+        client's first bytes are peeked at, and the socket is wrapped in
+        new streams, either after an SSL handshake or as is.
+
+        Coroutine.
+        '''
+        if self._is_tunnel:
+            self._reject_request('Cannot CONNECT within CONNECT')
+            return
+
+        self._is_tunnel = True
+
+        # The helper also sends the "200 Connection established" reply,
+        # so whatever the client sends next belongs to the tunnel.
+        original_socket = yield from self._detach_socket_and_start_tunnel()
+        is_ssl = yield from self._is_client_request_ssl(original_socket)
+
+        if is_ssl:
+            _logger.debug('Tunneling as SSL')
+            yield from self._start_ssl_tunnel()
+        else:
+            # Not SSL: keep the same socket, just give it fresh streams.
+            yield from self._rewrap_socket(original_socket)
+
+    @classmethod
+    @asyncio.coroutine
+    def _is_client_request_ssl(cls, socket_: socket.socket) -> bool:
+        '''Guess whether the client is starting SSL on the socket.
+
+        Up to 3 bytes are peeked (not consumed) from the socket. If every
+        peeked byte is an ASCII uppercase letter the result is False,
+        otherwise True.
+
+        Args:
+            socket_: The raw client socket to peek at.
+
+        Returns:
+            bool: True if the peeked bytes are not all in ``A``-``Z``.
+
+        Raises:
+            OSError: Any error from ``recv`` other than EWOULDBLOCK/EAGAIN.
+
+        Coroutine.
+        '''
+        while True:
+            # Switch to non-blocking for the peek only; the previous timeout
+            # is put back by the ``finally`` clause on every iteration.
+            original_timeout = socket_.gettimeout()
+            socket_.setblocking(False)
+
+            try:
+                # MSG_PEEK leaves the bytes in the receive buffer.
+                data = socket_.recv(3, socket.MSG_PEEK)
+            except OSError as error:
+                if error.errno in (errno.EWOULDBLOCK, errno.EAGAIN):
+                    # Nothing to read yet: wait briefly and poll again.
+                    yield from asyncio.sleep(0.01)
+                else:
+                    raise
+            else:
+                break
+            finally:
+                socket_.settimeout(original_timeout)
+
+        _logger.debug('peeked data %s', data)
+        # Presumably a plain-text request starts with an uppercase HTTP
+        # method name, while an SSL handshake does not -- TODO confirm.
+        # NOTE(review): ``all()`` is True for empty ``data``, so an empty
+        # peek is reported as "not SSL"; fewer than 3 bytes may be tested.
+        if all(ord('A') <= char_code <= ord('Z') for char_code in data):
+            return False
+        else:
+            return True
+
+    @asyncio.coroutine
+    def _start_ssl_tunnel(self):
+        '''Start SSL protocol on the socket.
+
+        Marks the session as an SSL tunnel, performs the server-side
+        handshake and replaces the reader and writer with streams built
+        on the resulting SSL socket.
+
+        Coroutine.
+        '''
+
+        # Set before the handshake; it is not reset here if the handshake
+        # raises.
+        self._is_ssl_tunnel = True
+        ssl_socket = yield from self._start_ssl_handshake()
+        yield from self._rewrap_socket(ssl_socket)
+
+    @asyncio.coroutine
+    def _detach_socket_and_start_tunnel(self) -> socket.socket:
+        '''Take the client socket off the event loop and accept the tunnel.
+
+        The socket's file descriptor is removed from the loop's readers,
+        the ``200 Connection established`` reply is written and drained,
+        and the descriptor is then removed from the loop's writers.
+
+        Returns:
+            socket.socket: The socket taken from the current writer.
+
+        Raises:
+            ConnectionAbortedError: If removing the reader or the writer
+                raises ValueError.
+
+        Coroutine.
+        '''
+        socket_ = self._writer.get_extra_info('socket')
+
+        # Stop the loop from watching the socket for incoming data before
+        # replying, so bytes sent after the reply stay on the socket.
+        try:
+            asyncio.get_event_loop().remove_reader(socket_.fileno())
+        except ValueError as error:
+            # Presumably the descriptor is no longer valid -- TODO confirm.
+            raise ConnectionAbortedError() from error
+
+        self._writer.write(b'HTTP/1.1 200 Connection established\r\n\r\n')
+        yield from self._writer.drain()
+
+        # The reply has been drained; write monitoring can go as well.
+        try:
+            asyncio.get_event_loop().remove_writer(socket_.fileno())
+        except ValueError as error:
+            raise ConnectionAbortedError() from error
+
+        return socket_
+
+    @asyncio.coroutine
+    def _start_ssl_handshake(self):
+        '''Wrap the client socket in server-side SSL and do the handshake.
+
+        The handshake is attempted up to 1200 times, sleeping 0.05 seconds
+        after each attempt that needs more I/O (about 60 seconds of
+        sleeping in total).
+
+        Returns:
+            The SSL-wrapped socket once the handshake has completed.
+
+        Raises:
+            ssl.SSLError: A handshake error other than want-read/want-write.
+            ConnectionAbortedError: If every attempt still needed more I/O.
+
+        Coroutine.
+        '''
+        socket_ = self._writer.get_extra_info('socket')
+
+        # do_handshake_on_connect=False: the handshake is driven by the
+        # loop below instead of happening inside wrap_socket().
+        ssl_socket = ssl.wrap_socket(
+            socket_, server_side=True,
+            certfile=self._cert_filename,
+            keyfile=self._key_filename,
+            do_handshake_on_connect=False
+        )
+
+        # FIXME: this isn't how to START TLS
+        # The handshake is polled with a fixed sleep between attempts.
+        for dummy in range(1200):
+            try:
+                ssl_socket.do_handshake()
+                break
+            except ssl.SSLError as error:
+                if error.errno in (ssl.SSL_ERROR_WANT_READ, ssl.SSL_ERROR_WANT_WRITE):
+                    # Handshake needs more I/O: wait and try again.
+                    _logger.debug('Do handshake %s', error)
+                    yield from asyncio.sleep(0.05)
+                else:
+                    raise
+        else:
+            # Loop ended without ``break``: no attempt completed.
+            _logger.error(_('Unable to handshake.'))
+            ssl_socket.close()
+            self._reject_request('Could not start TLS')
+            raise ConnectionAbortedError('Could not start TLS')
+
+        return ssl_socket
+
+    @asyncio.coroutine
+    def _rewrap_socket(self, new_socket):
+        '''Build new asyncio streams on a socket and switch to them.
+
+        Args:
+            new_socket: An already connected socket (plain or SSL-wrapped)
+                to hand to the event loop.
+
+        After this call ``self._reader`` and ``self._writer`` refer to the
+        new streams; ``_original_reader`` and ``_original_writer`` are not
+        modified here.
+
+        Coroutine.
+        '''
+        loop = asyncio.get_event_loop()
+        reader = asyncio.StreamReader(loop=loop)
+        protocol = asyncio.StreamReaderProtocol(reader, loop=loop)
+        # ``sock=`` makes the loop use the given socket as the transport.
+        # The second item of the returned pair is not used.
+        transport, dummy = yield from loop.create_connection(
+            lambda: protocol, sock=new_socket)
+        writer = asyncio.StreamWriter(transport, protocol, reader, loop)
+
+        self._reader = reader
+        self._writer = writer
+
     @asyncio.coroutine
     def _read_request_header(self) -> Request:
         request = Request()

diff --git a/wpull/scraper/html.py b/wpull/scraper/html.py
@@ -20,20 +20,15 @@
 _logger = StyleAdapter(logging.getLogger(__name__))
 
 
-_BaseLinkInfo = collections.namedtuple(
+LinkInfo = collections.namedtuple(
     'LinkInfoType',
     [
         'element', 'tag', 'attrib', 'link',
         'inline', 'linked', 'base_link', 'value_type',
         'link_type'
     ]
 )
-
-class LinkInfo(_BaseLinkInfo):
-    def __hash__(self):
-        return self.link.__hash__()
-
-'''Information about a link in a lxml document.  Comparable on link only.
+'''Information about a link in a lxml document.
 
 Attributes:
     element: An instance of :class:`.document.HTMLReadElement`.