[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
- # Match any character set and encoding
- (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
- |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
- # Assume the rest is data
- ,.*
- $
- ''',
- re.VERBOSE)
-
-
-class Filter(base.Filter):
- """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
- def __init__(self,
- source,
- allowed_elements=allowed_elements,
- allowed_attributes=allowed_attributes,
- allowed_css_properties=allowed_css_properties,
- allowed_css_keywords=allowed_css_keywords,
- allowed_svg_properties=allowed_svg_properties,
- allowed_protocols=allowed_protocols,
- allowed_content_types=allowed_content_types,
- attr_val_is_uri=attr_val_is_uri,
- svg_attr_val_allows_ref=svg_attr_val_allows_ref,
- svg_allow_local_href=svg_allow_local_href):
- super(Filter, self).__init__(source)
- self.allowed_elements = allowed_elements
- self.allowed_attributes = allowed_attributes
- self.allowed_css_properties = allowed_css_properties
- self.allowed_css_keywords = allowed_css_keywords
- self.allowed_svg_properties = allowed_svg_properties
- self.allowed_protocols = allowed_protocols
- self.allowed_content_types = allowed_content_types
- self.attr_val_is_uri = attr_val_is_uri
- self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
- self.svg_allow_local_href = svg_allow_local_href
-
- def __iter__(self):
- for token in base.Filter.__iter__(self):
- token = self.sanitize_token(token)
- if token:
- yield token
-
- # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
- # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
- # attributes are parsed, and a restricted set, # specified by
- # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
- # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
- # in ALLOWED_PROTOCOLS are allowed.
- #
- # sanitize_html('')
- # => <script> do_nasty_stuff() </script>
- # sanitize_html('Click here for $100')
- # => Click here for $100
- def sanitize_token(self, token):
-
- # accommodate filters which use token_type differently
- token_type = token["type"]
- if token_type in ("StartTag", "EndTag", "EmptyTag"):
- name = token["name"]
- namespace = token["namespace"]
- if ((namespace, name) in self.allowed_elements or
- (namespace is None and
- (namespaces["html"], name) in self.allowed_elements)):
- return self.allowed_token(token)
- else:
- return self.disallowed_token(token)
- elif token_type == "Comment":
- pass
- else:
- return token
-
- def allowed_token(self, token):
- if "data" in token:
- attrs = token["data"]
- attr_names = set(attrs.keys())
-
- # Remove forbidden attributes
- for to_remove in (attr_names - self.allowed_attributes):
- del token["data"][to_remove]
- attr_names.remove(to_remove)
-
- # Remove attributes with disallowed URL values
- for attr in (attr_names & self.attr_val_is_uri):
- assert attr in attrs
- # I don't have a clue where this regexp comes from or why it matches those
- # characters, nor why we call unescape. I just know it's always been here.
- # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
- # this will do is remove *more* than it otherwise would.
- val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '',
- unescape(attrs[attr])).lower()
- # remove replacement characters from unescaped characters
- val_unescaped = val_unescaped.replace("\ufffd", "")
- try:
- uri = urlparse.urlparse(val_unescaped)
- except ValueError:
- uri = None
- del attrs[attr]
- if uri and uri.scheme:
- if uri.scheme not in self.allowed_protocols:
- del attrs[attr]
- if uri.scheme == 'data':
- m = data_content_type.match(uri.path)
- if not m:
- del attrs[attr]
- elif m.group('content_type') not in self.allowed_content_types:
- del attrs[attr]
-
- for attr in self.svg_attr_val_allows_ref:
- if attr in attrs:
- attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
- ' ',
- unescape(attrs[attr]))
- if (token["name"] in self.svg_allow_local_href and
- (namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*',
- attrs[(namespaces['xlink'], 'href')])):
- del attrs[(namespaces['xlink'], 'href')]
- if (None, 'style') in attrs:
- attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
- token["data"] = attrs
- return token
-
- def disallowed_token(self, token):
- token_type = token["type"]
- if token_type == "EndTag":
- token["data"] = "%s>" % token["name"]
- elif token["data"]:
- assert token_type in ("StartTag", "EmptyTag")
- attrs = []
- for (ns, name), v in token["data"].items():
- attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
- token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
- else:
- token["data"] = "<%s>" % token["name"]
- if token.get("selfClosing"):
- token["data"] = token["data"][:-1] + "/>"
-
- token["type"] = "Characters"
-
- del token["name"]
- return token
-
- def sanitize_css(self, style):
- # disallow urls
- style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
-
- # gauntlet
- if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
- return ''
- if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
- return ''
-
- clean = []
- for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
- if not value:
- continue
- if prop.lower() in self.allowed_css_properties:
- clean.append(prop + ': ' + value + ';')
- elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
- 'padding']:
- for keyword in value.split():
- if keyword not in self.allowed_css_keywords and \
- not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
- break
- else:
- clean.append(prop + ': ' + value + ';')
- elif prop.lower() in self.allowed_svg_properties:
- clean.append(prop + ': ' + value + ';')
-
- return ' '.join(clean)
diff --git a/EntidadVirtual/horarios/venv/Lib/site-packages/pip-9.0.1-py3.6.egg/pip/_vendor/html5lib/filters/whitespace.py b/EntidadVirtual/horarios/venv/Lib/site-packages/pip-9.0.1-py3.6.egg/pip/_vendor/html5lib/filters/whitespace.py
deleted file mode 100644
index 8921052..0000000
--- a/EntidadVirtual/horarios/venv/Lib/site-packages/pip-9.0.1-py3.6.egg/pip/_vendor/html5lib/filters/whitespace.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-import re
-
-from . import base
-from ..constants import rcdataElements, spaceCharacters
-spaceCharacters = "".join(spaceCharacters)
-
-SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
-
-
-class Filter(base.Filter):
-
- spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
-
- def __iter__(self):
- preserve = 0
- for token in base.Filter.__iter__(self):
- type = token["type"]
- if type == "StartTag" \
- and (preserve or token["name"] in self.spacePreserveElements):
- preserve += 1
-
- elif type == "EndTag" and preserve:
- preserve -= 1
-
- elif not preserve and type == "SpaceCharacters" and token["data"]:
- # Test on token["data"] above to not introduce spaces where there were not
- token["data"] = " "
-
- elif not preserve and type == "Characters":
- token["data"] = collapse_spaces(token["data"])
-
- yield token
-
-
-def collapse_spaces(text):
- return SPACES_REGEX.sub(' ', text)
diff --git a/EntidadVirtual/horarios/venv/Lib/site-packages/pip-9.0.1-py3.6.egg/pip/_vendor/html5lib/html5parser.py b/EntidadVirtual/horarios/venv/Lib/site-packages/pip-9.0.1-py3.6.egg/pip/_vendor/html5lib/html5parser.py
deleted file mode 100644
index f7043cb..0000000
--- a/EntidadVirtual/horarios/venv/Lib/site-packages/pip-9.0.1-py3.6.egg/pip/_vendor/html5lib/html5parser.py
+++ /dev/null
@@ -1,2733 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-from pip._vendor.six import with_metaclass, viewkeys, PY3
-
-import types
-
-try:
- from collections import OrderedDict
-except ImportError:
- from pip._vendor.ordereddict import OrderedDict
-
-from . import _inputstream
-from . import _tokenizer
-
-from . import treebuilders
-from .treebuilders.base import Marker
-
-from . import _utils
-from .constants import (
- spaceCharacters, asciiUpper2Lower,
- specialElements, headingElements, cdataElements, rcdataElements,
- tokenTypes, tagTokenTypes,
- namespaces,
- htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
- adjustForeignAttributes as adjustForeignAttributesMap,
- adjustMathMLAttributes, adjustSVGAttributes,
- E,
- ReparseException
-)
-
-
-def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
- """Parse a string or file-like object into a tree"""
- tb = treebuilders.getTreeBuilder(treebuilder)
- p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
- return p.parse(doc, **kwargs)
-
-
-def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
- tb = treebuilders.getTreeBuilder(treebuilder)
- p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
- return p.parseFragment(doc, container=container, **kwargs)
-
-
-def method_decorator_metaclass(function):
- class Decorated(type):
- def __new__(meta, classname, bases, classDict):
- for attributeName, attribute in classDict.items():
- if isinstance(attribute, types.FunctionType):
- attribute = function(attribute)
-
- classDict[attributeName] = attribute
- return type.__new__(meta, classname, bases, classDict)
- return Decorated
-
-
-class HTMLParser(object):
- """HTML parser. Generates a tree structure from a stream of (possibly
- malformed) HTML"""
-
- def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
- """
- strict - raise an exception when a parse error is encountered
-
- tree - a treebuilder class controlling the type of tree that will be
- returned. Built in treebuilders can be accessed through
- html5lib.treebuilders.getTreeBuilder(treeType)
- """
-
- # Raise an exception on the first error encountered
- self.strict = strict
-
- if tree is None:
- tree = treebuilders.getTreeBuilder("etree")
- self.tree = tree(namespaceHTMLElements)
- self.errors = []
-
- self.phases = dict([(name, cls(self, self.tree)) for name, cls in
- getPhases(debug).items()])
-
- def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
-
- self.innerHTMLMode = innerHTML
- self.container = container
- self.scripting = scripting
- self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
- self.reset()
-
- try:
- self.mainLoop()
- except ReparseException:
- self.reset()
- self.mainLoop()
-
- def reset(self):
- self.tree.reset()
- self.firstStartTag = False
- self.errors = []
- self.log = [] # only used with debug mode
- # "quirks" / "limited quirks" / "no quirks"
- self.compatMode = "no quirks"
-
- if self.innerHTMLMode:
- self.innerHTML = self.container.lower()
-
- if self.innerHTML in cdataElements:
- self.tokenizer.state = self.tokenizer.rcdataState
- elif self.innerHTML in rcdataElements:
- self.tokenizer.state = self.tokenizer.rawtextState
- elif self.innerHTML == 'plaintext':
- self.tokenizer.state = self.tokenizer.plaintextState
- else:
- # state already is data state
- # self.tokenizer.state = self.tokenizer.dataState
- pass
- self.phase = self.phases["beforeHtml"]
- self.phase.insertHtmlElement()
- self.resetInsertionMode()
- else:
- self.innerHTML = False # pylint:disable=redefined-variable-type
- self.phase = self.phases["initial"]
-
- self.lastPhase = None
-
- self.beforeRCDataPhase = None
-
- self.framesetOK = True
-
- @property
- def documentEncoding(self):
- """The name of the character encoding
- that was used to decode the input stream,
- or :obj:`None` if that is not determined yet.
-
- """
- if not hasattr(self, 'tokenizer'):
- return None
- return self.tokenizer.stream.charEncoding[0].name
-
- def isHTMLIntegrationPoint(self, element):
- if (element.name == "annotation-xml" and
- element.namespace == namespaces["mathml"]):
- return ("encoding" in element.attributes and
- element.attributes["encoding"].translate(
- asciiUpper2Lower) in
- ("text/html", "application/xhtml+xml"))
- else:
- return (element.namespace, element.name) in htmlIntegrationPointElements
-
- def isMathMLTextIntegrationPoint(self, element):
- return (element.namespace, element.name) in mathmlTextIntegrationPointElements
-
- def mainLoop(self):
- CharactersToken = tokenTypes["Characters"]
- SpaceCharactersToken = tokenTypes["SpaceCharacters"]
- StartTagToken = tokenTypes["StartTag"]
- EndTagToken = tokenTypes["EndTag"]
- CommentToken = tokenTypes["Comment"]
- DoctypeToken = tokenTypes["Doctype"]
- ParseErrorToken = tokenTypes["ParseError"]
-
- for token in self.normalizedTokens():
- prev_token = None
- new_token = token
- while new_token is not None:
- prev_token = new_token
- currentNode = self.tree.openElements[-1] if self.tree.openElements else None
- currentNodeNamespace = currentNode.namespace if currentNode else None
- currentNodeName = currentNode.name if currentNode else None
-
- type = new_token["type"]
-
- if type == ParseErrorToken:
- self.parseError(new_token["data"], new_token.get("datavars", {}))
- new_token = None
- else:
- if (len(self.tree.openElements) == 0 or
- currentNodeNamespace == self.tree.defaultNamespace or
- (self.isMathMLTextIntegrationPoint(currentNode) and
- ((type == StartTagToken and
- token["name"] not in frozenset(["mglyph", "malignmark"])) or
- type in (CharactersToken, SpaceCharactersToken))) or
- (currentNodeNamespace == namespaces["mathml"] and
- currentNodeName == "annotation-xml" and
- type == StartTagToken and
- token["name"] == "svg") or
- (self.isHTMLIntegrationPoint(currentNode) and
- type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
- phase = self.phase
- else:
- phase = self.phases["inForeignContent"]
-
- if type == CharactersToken:
- new_token = phase.processCharacters(new_token)
- elif type == SpaceCharactersToken:
- new_token = phase.processSpaceCharacters(new_token)
- elif type == StartTagToken:
- new_token = phase.processStartTag(new_token)
- elif type == EndTagToken:
- new_token = phase.processEndTag(new_token)
- elif type == CommentToken:
- new_token = phase.processComment(new_token)
- elif type == DoctypeToken:
- new_token = phase.processDoctype(new_token)
-
- if (type == StartTagToken and prev_token["selfClosing"] and
- not prev_token["selfClosingAcknowledged"]):
- self.parseError("non-void-element-with-trailing-solidus",
- {"name": prev_token["name"]})
-
- # When the loop finishes it's EOF
- reprocess = True
- phases = []
- while reprocess:
- phases.append(self.phase)
- reprocess = self.phase.processEOF()
- if reprocess:
- assert self.phase not in phases
-
- def normalizedTokens(self):
- for token in self.tokenizer:
- yield self.normalizeToken(token)
-
- def parse(self, stream, *args, **kwargs):
- """Parse a HTML document into a well-formed tree
-
- stream - a filelike object or string containing the HTML to be parsed
-
- The optional encoding parameter must be a string that indicates
- the encoding. If specified, that encoding will be used,
- regardless of any BOM or later declaration (such as in a meta
- element)
-
- scripting - treat noscript elements as if javascript was turned on
- """
- self._parse(stream, False, None, *args, **kwargs)
- return self.tree.getDocument()
-
- def parseFragment(self, stream, *args, **kwargs):
- """Parse a HTML fragment into a well-formed tree fragment
-
- container - name of the element we're setting the innerHTML property
- if set to None, default to 'div'
-
- stream - a filelike object or string containing the HTML to be parsed
-
- The optional encoding parameter must be a string that indicates
- the encoding. If specified, that encoding will be used,
- regardless of any BOM or later declaration (such as in a meta
- element)
-
- scripting - treat noscript elements as if javascript was turned on
- """
- self._parse(stream, True, *args, **kwargs)
- return self.tree.getFragment()
-
- def parseError(self, errorcode="XXX-undefined-error", datavars=None):
- # XXX The idea is to make errorcode mandatory.
- if datavars is None:
- datavars = {}
- self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
- if self.strict:
- raise ParseError(E[errorcode] % datavars)
-
- def normalizeToken(self, token):
- """ HTML5 specific normalizations to the token stream """
-
- if token["type"] == tokenTypes["StartTag"]:
- raw = token["data"]
- token["data"] = OrderedDict(raw)
- if len(raw) > len(token["data"]):
- # we had some duplicated attribute, fix so first wins
- token["data"].update(raw[::-1])
-
- return token
-
- def adjustMathMLAttributes(self, token):
- adjust_attributes(token, adjustMathMLAttributes)
-
- def adjustSVGAttributes(self, token):
- adjust_attributes(token, adjustSVGAttributes)
-
- def adjustForeignAttributes(self, token):
- adjust_attributes(token, adjustForeignAttributesMap)
-
- def reparseTokenNormal(self, token):
- # pylint:disable=unused-argument
- self.parser.phase()
-
- def resetInsertionMode(self):
- # The name of this method is mostly historical. (It's also used in the
- # specification.)
- last = False
- newModes = {
- "select": "inSelect",
- "td": "inCell",
- "th": "inCell",
- "tr": "inRow",
- "tbody": "inTableBody",
- "thead": "inTableBody",
- "tfoot": "inTableBody",
- "caption": "inCaption",
- "colgroup": "inColumnGroup",
- "table": "inTable",
- "head": "inBody",
- "body": "inBody",
- "frameset": "inFrameset",
- "html": "beforeHead"
- }
- for node in self.tree.openElements[::-1]:
- nodeName = node.name
- new_phase = None
- if node == self.tree.openElements[0]:
- assert self.innerHTML
- last = True
- nodeName = self.innerHTML
- # Check for conditions that should only happen in the innerHTML
- # case
- if nodeName in ("select", "colgroup", "head", "html"):
- assert self.innerHTML
-
- if not last and node.namespace != self.tree.defaultNamespace:
- continue
-
- if nodeName in newModes:
- new_phase = self.phases[newModes[nodeName]]
- break
- elif last:
- new_phase = self.phases["inBody"]
- break
-
- self.phase = new_phase
-
- def parseRCDataRawtext(self, token, contentType):
- """Generic RCDATA/RAWTEXT Parsing algorithm
- contentType - RCDATA or RAWTEXT
- """
- assert contentType in ("RAWTEXT", "RCDATA")
-
- self.tree.insertElement(token)
-
- if contentType == "RAWTEXT":
- self.tokenizer.state = self.tokenizer.rawtextState
- else:
- self.tokenizer.state = self.tokenizer.rcdataState
-
- self.originalPhase = self.phase
-
- self.phase = self.phases["text"]
-
-
-@_utils.memoize
-def getPhases(debug):
- def log(function):
- """Logger that records which phase processes each token"""
- type_names = dict((value, key) for key, value in
- tokenTypes.items())
-
- def wrapped(self, *args, **kwargs):
- if function.__name__.startswith("process") and len(args) > 0:
- token = args[0]
- try:
- info = {"type": type_names[token['type']]}
- except:
- raise
- if token['type'] in tagTokenTypes:
- info["name"] = token['name']
-
- self.parser.log.append((self.parser.tokenizer.state.__name__,
- self.parser.phase.__class__.__name__,
- self.__class__.__name__,
- function.__name__,
- info))
- return function(self, *args, **kwargs)
- else:
- return function(self, *args, **kwargs)
- return wrapped
-
- def getMetaclass(use_metaclass, metaclass_func):
- if use_metaclass:
- return method_decorator_metaclass(metaclass_func)
- else:
- return type
-
- # pylint:disable=unused-argument
- class Phase(with_metaclass(getMetaclass(debug, log))):
- """Base class for helper object that implements each phase of processing
- """
-
- def __init__(self, parser, tree):
- self.parser = parser
- self.tree = tree
-
- def processEOF(self):
- raise NotImplementedError
-
- def processComment(self, token):
- # For most phases the following is correct. Where it's not it will be
- # overridden.
- self.tree.insertComment(token, self.tree.openElements[-1])
-
- def processDoctype(self, token):
- self.parser.parseError("unexpected-doctype")
-
- def processCharacters(self, token):
- self.tree.insertText(token["data"])
-
- def processSpaceCharacters(self, token):
- self.tree.insertText(token["data"])
-
- def processStartTag(self, token):
- return self.startTagHandler[token["name"]](token)
-
- def startTagHtml(self, token):
- if not self.parser.firstStartTag and token["name"] == "html":
- self.parser.parseError("non-html-root")
- # XXX Need a check here to see if the first start tag token emitted is
- # this token... If it's not, invoke self.parser.parseError().
- for attr, value in token["data"].items():
- if attr not in self.tree.openElements[0].attributes:
- self.tree.openElements[0].attributes[attr] = value
- self.parser.firstStartTag = False
-
- def processEndTag(self, token):
- return self.endTagHandler[token["name"]](token)
-
- class InitialPhase(Phase):
- def processSpaceCharacters(self, token):
- pass
-
- def processComment(self, token):
- self.tree.insertComment(token, self.tree.document)
-
- def processDoctype(self, token):
- name = token["name"]
- publicId = token["publicId"]
- systemId = token["systemId"]
- correct = token["correct"]
-
- if (name != "html" or publicId is not None or
- systemId is not None and systemId != "about:legacy-compat"):
- self.parser.parseError("unknown-doctype")
-
- if publicId is None:
- publicId = ""
-
- self.tree.insertDoctype(token)
-
- if publicId != "":
- publicId = publicId.translate(asciiUpper2Lower)
-
- if (not correct or token["name"] != "html" or
- publicId.startswith(
- ("+//silmaril//dtd html pro v0r11 19970101//",
- "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
- "-//as//dtd html 3.0 aswedit + extensions//",
- "-//ietf//dtd html 2.0 level 1//",
- "-//ietf//dtd html 2.0 level 2//",
- "-//ietf//dtd html 2.0 strict level 1//",
- "-//ietf//dtd html 2.0 strict level 2//",
- "-//ietf//dtd html 2.0 strict//",
- "-//ietf//dtd html 2.0//",
- "-//ietf//dtd html 2.1e//",
- "-//ietf//dtd html 3.0//",
- "-//ietf//dtd html 3.2 final//",
- "-//ietf//dtd html 3.2//",
- "-//ietf//dtd html 3//",
- "-//ietf//dtd html level 0//",
- "-//ietf//dtd html level 1//",
- "-//ietf//dtd html level 2//",
- "-//ietf//dtd html level 3//",
- "-//ietf//dtd html strict level 0//",
- "-//ietf//dtd html strict level 1//",
- "-//ietf//dtd html strict level 2//",
- "-//ietf//dtd html strict level 3//",
- "-//ietf//dtd html strict//",
- "-//ietf//dtd html//",
- "-//metrius//dtd metrius presentational//",
- "-//microsoft//dtd internet explorer 2.0 html strict//",
- "-//microsoft//dtd internet explorer 2.0 html//",
- "-//microsoft//dtd internet explorer 2.0 tables//",
- "-//microsoft//dtd internet explorer 3.0 html strict//",
- "-//microsoft//dtd internet explorer 3.0 html//",
- "-//microsoft//dtd internet explorer 3.0 tables//",
- "-//netscape comm. corp.//dtd html//",
- "-//netscape comm. corp.//dtd strict html//",
- "-//o'reilly and associates//dtd html 2.0//",
- "-//o'reilly and associates//dtd html extended 1.0//",
- "-//o'reilly and associates//dtd html extended relaxed 1.0//",
- "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
- "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
- "-//spyglass//dtd html 2.0 extended//",
- "-//sq//dtd html 2.0 hotmetal + extensions//",
- "-//sun microsystems corp.//dtd hotjava html//",
- "-//sun microsystems corp.//dtd hotjava strict html//",
- "-//w3c//dtd html 3 1995-03-24//",
- "-//w3c//dtd html 3.2 draft//",
- "-//w3c//dtd html 3.2 final//",
- "-//w3c//dtd html 3.2//",
- "-//w3c//dtd html 3.2s draft//",
- "-//w3c//dtd html 4.0 frameset//",
- "-//w3c//dtd html 4.0 transitional//",
- "-//w3c//dtd html experimental 19960712//",
- "-//w3c//dtd html experimental 970421//",
- "-//w3c//dtd w3 html//",
- "-//w3o//dtd w3 html 3.0//",
- "-//webtechs//dtd mozilla html 2.0//",
- "-//webtechs//dtd mozilla html//")) or
- publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
- "-/w3c/dtd html 4.0 transitional/en",
- "html") or
- publicId.startswith(
- ("-//w3c//dtd html 4.01 frameset//",
- "-//w3c//dtd html 4.01 transitional//")) and
- systemId is None or
- systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
- self.parser.compatMode = "quirks"
- elif (publicId.startswith(
- ("-//w3c//dtd xhtml 1.0 frameset//",
- "-//w3c//dtd xhtml 1.0 transitional//")) or
- publicId.startswith(
- ("-//w3c//dtd html 4.01 frameset//",
- "-//w3c//dtd html 4.01 transitional//")) and
- systemId is not None):
- self.parser.compatMode = "limited quirks"
-
- self.parser.phase = self.parser.phases["beforeHtml"]
-
- def anythingElse(self):
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
-
- def processCharacters(self, token):
- self.parser.parseError("expected-doctype-but-got-chars")
- self.anythingElse()
- return token
-
- def processStartTag(self, token):
- self.parser.parseError("expected-doctype-but-got-start-tag",
- {"name": token["name"]})
- self.anythingElse()
- return token
-
- def processEndTag(self, token):
- self.parser.parseError("expected-doctype-but-got-end-tag",
- {"name": token["name"]})
- self.anythingElse()
- return token
-
- def processEOF(self):
- self.parser.parseError("expected-doctype-but-got-eof")
- self.anythingElse()
- return True
-
- class BeforeHtmlPhase(Phase):
- # helper methods
- def insertHtmlElement(self):
- self.tree.insertRoot(impliedTagToken("html", "StartTag"))
- self.parser.phase = self.parser.phases["beforeHead"]
-
- # other
- def processEOF(self):
- self.insertHtmlElement()
- return True
-
- def processComment(self, token):
- self.tree.insertComment(token, self.tree.document)
-
- def processSpaceCharacters(self, token):
- pass
-
- def processCharacters(self, token):
- self.insertHtmlElement()
- return token
-
- def processStartTag(self, token):
- if token["name"] == "html":
- self.parser.firstStartTag = True
- self.insertHtmlElement()
- return token
-
- def processEndTag(self, token):
- if token["name"] not in ("head", "body", "html", "br"):
- self.parser.parseError("unexpected-end-tag-before-html",
- {"name": token["name"]})
- else:
- self.insertHtmlElement()
- return token
-
- class BeforeHeadPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
-
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("head", self.startTagHead)
- ])
- self.startTagHandler.default = self.startTagOther
-
- self.endTagHandler = _utils.MethodDispatcher([
- (("head", "body", "html", "br"), self.endTagImplyHead)
- ])
- self.endTagHandler.default = self.endTagOther
-
- def processEOF(self):
- self.startTagHead(impliedTagToken("head", "StartTag"))
- return True
-
- def processSpaceCharacters(self, token):
- pass
-
- def processCharacters(self, token):
- self.startTagHead(impliedTagToken("head", "StartTag"))
- return token
-
- def startTagHtml(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
-
- def startTagHead(self, token):
- self.tree.insertElement(token)
- self.tree.headPointer = self.tree.openElements[-1]
- self.parser.phase = self.parser.phases["inHead"]
-
- def startTagOther(self, token):
- self.startTagHead(impliedTagToken("head", "StartTag"))
- return token
-
- def endTagImplyHead(self, token):
- self.startTagHead(impliedTagToken("head", "StartTag"))
- return token
-
- def endTagOther(self, token):
- self.parser.parseError("end-tag-after-implied-root",
- {"name": token["name"]})
-
- class InHeadPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
-
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("title", self.startTagTitle),
- (("noframes", "style"), self.startTagNoFramesStyle),
- ("noscript", self.startTagNoscript),
- ("script", self.startTagScript),
- (("base", "basefont", "bgsound", "command", "link"),
- self.startTagBaseLinkCommand),
- ("meta", self.startTagMeta),
- ("head", self.startTagHead)
- ])
- self.startTagHandler.default = self.startTagOther
-
- self.endTagHandler = _utils.MethodDispatcher([
- ("head", self.endTagHead),
- (("br", "html", "body"), self.endTagHtmlBodyBr)
- ])
- self.endTagHandler.default = self.endTagOther
-
- # the real thing
- def processEOF(self):
- self.anythingElse()
- return True
-
- def processCharacters(self, token):
- self.anythingElse()
- return token
-
- def startTagHtml(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
-
- def startTagHead(self, token):
- self.parser.parseError("two-heads-are-not-better-than-one")
-
- def startTagBaseLinkCommand(self, token):
- self.tree.insertElement(token)
- self.tree.openElements.pop()
- token["selfClosingAcknowledged"] = True
-
- def startTagMeta(self, token):
- self.tree.insertElement(token)
- self.tree.openElements.pop()
- token["selfClosingAcknowledged"] = True
-
- attributes = token["data"]
- if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
- if "charset" in attributes:
- self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
- elif ("content" in attributes and
- "http-equiv" in attributes and
- attributes["http-equiv"].lower() == "content-type"):
- # Encoding it as UTF-8 here is a hack, as really we should pass
- # the abstract Unicode string, and just use the
- # ContentAttrParser on that, but using UTF-8 allows all chars
- # to be encoded and as a ASCII-superset works.
- data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
- parser = _inputstream.ContentAttrParser(data)
- codec = parser.parse()
- self.parser.tokenizer.stream.changeEncoding(codec)
-
- def startTagTitle(self, token):
- self.parser.parseRCDataRawtext(token, "RCDATA")
-
- def startTagNoFramesStyle(self, token):
- # Need to decide whether to implement the scripting-disabled case
- self.parser.parseRCDataRawtext(token, "RAWTEXT")
-
- def startTagNoscript(self, token):
- if self.parser.scripting:
- self.parser.parseRCDataRawtext(token, "RAWTEXT")
- else:
- self.tree.insertElement(token)
- self.parser.phase = self.parser.phases["inHeadNoscript"]
-
- def startTagScript(self, token):
- self.tree.insertElement(token)
- self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
- self.parser.originalPhase = self.parser.phase
- self.parser.phase = self.parser.phases["text"]
-
- def startTagOther(self, token):
- self.anythingElse()
- return token
-
- def endTagHead(self, token):
- node = self.parser.tree.openElements.pop()
- assert node.name == "head", "Expected head got %s" % node.name
- self.parser.phase = self.parser.phases["afterHead"]
-
- def endTagHtmlBodyBr(self, token):
- self.anythingElse()
- return token
-
- def endTagOther(self, token):
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
-
- def anythingElse(self):
- self.endTagHead(impliedTagToken("head"))
-
- class InHeadNoscriptPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
-
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
- (("head", "noscript"), self.startTagHeadNoscript),
- ])
- self.startTagHandler.default = self.startTagOther
-
- self.endTagHandler = _utils.MethodDispatcher([
- ("noscript", self.endTagNoscript),
- ("br", self.endTagBr),
- ])
- self.endTagHandler.default = self.endTagOther
-
- def processEOF(self):
- self.parser.parseError("eof-in-head-noscript")
- self.anythingElse()
- return True
-
- def processComment(self, token):
- return self.parser.phases["inHead"].processComment(token)
-
- def processCharacters(self, token):
- self.parser.parseError("char-in-head-noscript")
- self.anythingElse()
- return token
-
- def processSpaceCharacters(self, token):
- return self.parser.phases["inHead"].processSpaceCharacters(token)
-
- def startTagHtml(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
-
- def startTagBaseLinkCommand(self, token):
- return self.parser.phases["inHead"].processStartTag(token)
-
- def startTagHeadNoscript(self, token):
- self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
-
- def startTagOther(self, token):
- self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
- self.anythingElse()
- return token
-
- def endTagNoscript(self, token):
- node = self.parser.tree.openElements.pop()
- assert node.name == "noscript", "Expected noscript got %s" % node.name
- self.parser.phase = self.parser.phases["inHead"]
-
- def endTagBr(self, token):
- self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
- self.anythingElse()
- return token
-
- def endTagOther(self, token):
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
-
- def anythingElse(self):
- # Caller must raise parse error first!
- self.endTagNoscript(impliedTagToken("noscript"))
-
- class AfterHeadPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
-
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("body", self.startTagBody),
- ("frameset", self.startTagFrameset),
- (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
- "style", "title"),
- self.startTagFromHead),
- ("head", self.startTagHead)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
- self.endTagHtmlBodyBr)])
- self.endTagHandler.default = self.endTagOther
-
- def processEOF(self):
- self.anythingElse()
- return True
-
- def processCharacters(self, token):
- self.anythingElse()
- return token
-
- def startTagHtml(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
-
- def startTagBody(self, token):
- self.parser.framesetOK = False
- self.tree.insertElement(token)
- self.parser.phase = self.parser.phases["inBody"]
-
- def startTagFrameset(self, token):
- self.tree.insertElement(token)
- self.parser.phase = self.parser.phases["inFrameset"]
-
- def startTagFromHead(self, token):
- self.parser.parseError("unexpected-start-tag-out-of-my-head",
- {"name": token["name"]})
- self.tree.openElements.append(self.tree.headPointer)
- self.parser.phases["inHead"].processStartTag(token)
- for node in self.tree.openElements[::-1]:
- if node.name == "head":
- self.tree.openElements.remove(node)
- break
-
- def startTagHead(self, token):
- self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
-
- def startTagOther(self, token):
- self.anythingElse()
- return token
-
- def endTagHtmlBodyBr(self, token):
- self.anythingElse()
- return token
-
- def endTagOther(self, token):
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
-
- def anythingElse(self):
- self.tree.insertElement(impliedTagToken("body", "StartTag"))
- self.parser.phase = self.parser.phases["inBody"]
- self.parser.framesetOK = True
-
- class InBodyPhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
- # the really-really-really-very crazy mode
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
-
- # Set this to the default handler
- self.processSpaceCharacters = self.processSpaceCharactersNonPre
-
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- (("base", "basefont", "bgsound", "command", "link", "meta",
- "script", "style", "title"),
- self.startTagProcessInHead),
- ("body", self.startTagBody),
- ("frameset", self.startTagFrameset),
- (("address", "article", "aside", "blockquote", "center", "details",
- "dir", "div", "dl", "fieldset", "figcaption", "figure",
- "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
- "section", "summary", "ul"),
- self.startTagCloseP),
- (headingElements, self.startTagHeading),
- (("pre", "listing"), self.startTagPreListing),
- ("form", self.startTagForm),
- (("li", "dd", "dt"), self.startTagListItem),
- ("plaintext", self.startTagPlaintext),
- ("a", self.startTagA),
- (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
- "strong", "tt", "u"), self.startTagFormatting),
- ("nobr", self.startTagNobr),
- ("button", self.startTagButton),
- (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
- ("xmp", self.startTagXmp),
- ("table", self.startTagTable),
- (("area", "br", "embed", "img", "keygen", "wbr"),
- self.startTagVoidFormatting),
- (("param", "source", "track"), self.startTagParamSource),
- ("input", self.startTagInput),
- ("hr", self.startTagHr),
- ("image", self.startTagImage),
- ("isindex", self.startTagIsIndex),
- ("textarea", self.startTagTextarea),
- ("iframe", self.startTagIFrame),
- ("noscript", self.startTagNoscript),
- (("noembed", "noframes"), self.startTagRawtext),
- ("select", self.startTagSelect),
- (("rp", "rt"), self.startTagRpRt),
- (("option", "optgroup"), self.startTagOpt),
- (("math"), self.startTagMath),
- (("svg"), self.startTagSvg),
- (("caption", "col", "colgroup", "frame", "head",
- "tbody", "td", "tfoot", "th", "thead",
- "tr"), self.startTagMisplaced)
- ])
- self.startTagHandler.default = self.startTagOther
-
- self.endTagHandler = _utils.MethodDispatcher([
- ("body", self.endTagBody),
- ("html", self.endTagHtml),
- (("address", "article", "aside", "blockquote", "button", "center",
- "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
- "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
- "section", "summary", "ul"), self.endTagBlock),
- ("form", self.endTagForm),
- ("p", self.endTagP),
- (("dd", "dt", "li"), self.endTagListItem),
- (headingElements, self.endTagHeading),
- (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
- "strike", "strong", "tt", "u"), self.endTagFormatting),
- (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
- ("br", self.endTagBr),
- ])
- self.endTagHandler.default = self.endTagOther
-
- def isMatchingFormattingElement(self, node1, node2):
- return (node1.name == node2.name and
- node1.namespace == node2.namespace and
- node1.attributes == node2.attributes)
-
- # helper
- def addFormattingElement(self, token):
- self.tree.insertElement(token)
- element = self.tree.openElements[-1]
-
- matchingElements = []
- for node in self.tree.activeFormattingElements[::-1]:
- if node is Marker:
- break
- elif self.isMatchingFormattingElement(node, element):
- matchingElements.append(node)
-
- assert len(matchingElements) <= 3
- if len(matchingElements) == 3:
- self.tree.activeFormattingElements.remove(matchingElements[-1])
- self.tree.activeFormattingElements.append(element)
-
- # the real deal
- def processEOF(self):
- allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
- "tfoot", "th", "thead", "tr", "body",
- "html"))
- for node in self.tree.openElements[::-1]:
- if node.name not in allowed_elements:
- self.parser.parseError("expected-closing-tag-but-got-eof")
- break
- # Stop parsing
-
- def processSpaceCharactersDropNewline(self, token):
- # Sometimes (start of , , and