From cbfee5319060cf9ab98f935ec1deee8bb10c5b68 Mon Sep 17 00:00:00 2001 From: jesko Date: Sat, 28 Sep 2024 00:33:43 +0200 Subject: [PATCH] fixes bugs after xml changes --- refinery/lib/xml.py | 50 +++++++++++++++++++++------------- refinery/units/formats/html.py | 35 +++++++++--------------- refinery/units/formats/xml.py | 2 -- 3 files changed, 44 insertions(+), 43 deletions(-) diff --git a/refinery/lib/xml.py b/refinery/lib/xml.py index edd17ca1e6..f2f16b10d0 100644 --- a/refinery/lib/xml.py +++ b/refinery/lib/xml.py @@ -11,13 +11,16 @@ import defusedxml.ElementTree as et import collections -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional, TYPE_CHECKING from xml.parsers import expat from xml.etree.ElementTree import Element, ElementTree from refinery.lib.structures import MemoryFile from refinery.lib.tools import exception_to_string +if TYPE_CHECKING: + from typing import Self + def ForgivingParse(data: bytes, entities=None) -> ElementTree: """ @@ -82,18 +85,18 @@ class XMLNodeBase: __slots__ = 'tag', 'index', 'children', 'empty', 'attributes', 'content', '_parent', '__weakref__' attributes: Dict[str, Any] - children: List[XMLNodeBase] + children: List[Self] content: Optional[str] - parent: Optional[weakref.ProxyType[XMLNodeBase]] - subtree: Iterable[XMLNodeBase] + parent: Optional[weakref.ProxyType[Self]] + subtree: Iterable[Self] empty: bool tag: Optional[str] def __init__( self, - tag: str, - index: Optional[int], - parent: Optional[XMLNodeBase] = None, + tag: Optional[str], + index: Optional[int] = None, + parent: Optional[Self] = None, content: Optional[str] = None, empty: bool = False, attributes: Optional[Dict[str, Any]] = None, @@ -123,9 +126,6 @@ def parent(self, parent): parent = weakref.ref(parent) self._parent = parent - def __hash__(self): - return hash((hash(self.parent), self.tag, self.index)) - def __eq__(self, other: XMLNodeBase): return self.parent == other.parent and self.tag == other.tag and self.index == other.index @@ -155,6 +155,21 @@ def __getitem__(self, key): def get_attribute(self, key, default=None): return self.attributes.get(key, default) + def reindex(self): + """ + Computes the index values of all nodes in the subtree. + """ + pre_count = collections.Counter(child.tag for child in self.children) + tag_count = collections.Counter() + for child in self.children: + tag = child.tag + if pre_count[tag] == 1: + child.index = None + else: + tag_count[tag] += 1 + child.index = tag_count[tag] + child.reindex() + def child(self, tag: str): """ Return the first child with the given tag. This is useful especialyl for documents where @@ -166,7 +181,7 @@ def child(self, tag: str): raise LookupError(tag) @property - def subtree(self) -> Iterable[XMLNodeBase]: + def subtree(self) -> Iterable[Self]: """ Iterate all items that are reachable from the current node. """ @@ -189,8 +204,8 @@ class XMLNode(XMLNodeBase): source: Optional[Element] - def __init__(self, tag: str, index: int, parent: Optional[XMLNode] = None, source: Optional[Element] = None): - super().__init__(tag, index, parent) + def __init__(self, tag: str, parent: Optional[Self] = None, source: Optional[Element] = None): + super().__init__(tag, parent=parent) self.source = source def write(self, stream): @@ -209,19 +224,16 @@ def parse(data) -> XMLNode: tree that is generated by the standard library. """ def translate(element: Element, cursor: XMLNode, level: int = 0): - total = collections.Counter(child.tag for child in element) - count = collections.Counter() for child in element: tag = child.tag - index = None if total[tag] == 1 else count[tag] - node = XMLNode(tag, index, cursor, child) - count[tag] += 1 + node = XMLNode(tag, cursor, child) translate(child, node, level + 1) cursor.children.append(node) cursor.attributes = element.attrib cursor.content = element.text or element.tail or '' return cursor root = ForgivingParse(data).getroot() - rt = translate(root, XMLNode(root.tag, None)) + rt = translate(root, XMLNode(root.tag)) rt.source = root + rt.reindex() return rt diff --git a/refinery/units/formats/html.py b/refinery/units/formats/html.py index 7268bf55dc..e7247c4c0b 100644 --- a/refinery/units/formats/html.py +++ b/refinery/units/formats/html.py @@ -6,9 +6,7 @@ from refinery.lib.meta import metavars from refinery.units.formats import XMLToPathExtractorUnit, UnpackResult, Arg -import io - -from collections import Counter +from io import StringIO from html.parser import HTMLParser _HTML_DATA_ROOT_TAG = 'html' @@ -27,7 +25,7 @@ def root(self) -> bool: return self.tag == _HTML_DATA_ROOT_TAG def recover(self, inner=True) -> str: - with io.StringIO() as stream: + with StringIO() as stream: if not inner: stream.write(self.content) for child in self.children: @@ -64,7 +62,7 @@ def __init__(self) -> None: def handle_starttag(self, tag: str, attributes): if tag in self._SELF_CLOSING_TAGS: return - node = HTMLNode(tag, self.tos, self.get_starttag_text(), attributes={ + node = HTMLNode(tag, None, self.tos, self.get_starttag_text(), attributes={ key: value for key, value in attributes if key and value}) children = self.tos.children previous = children[-1] if children else None @@ -92,7 +90,7 @@ def handle_entityref(self, name: str) -> None: if last.textual: last.content += ntt return - self.tos.children.append(HTMLNode(None, self.tos, ntt)) + self.tos.children.append(HTMLNode(None, None, self.tos, ntt)) def handle_charref(self, name: str) -> None: self.handle_entityref(F'#{name}') @@ -113,7 +111,7 @@ def handle_endtag(self, tag: str): self.tos = cursor.parent def handle_data(self, data): - self.tos.children.append(HTMLNode(None, self.tos, data)) + self.tos.children.append(HTMLNode(None, None, self.tos, data)) class xthtml(XMLToPathExtractorUnit): @@ -133,6 +131,8 @@ def unpack(self, data): html = HTMLTreeParser() html.feed(data.decode(self.codec)) root = html.tos + root.reindex() + meta = metavars(data) path = self._make_path_builder(meta, root) @@ -140,8 +140,11 @@ def unpack(self, data): self.log_info(F'tag was not closed: {root.tag}') root = root.parent - while len(root.children) == 1 and root.children[0].tag == root.tag: - root, = root.children + while len(root.children) == 1: + child, = root.children + if child.tag != root.tag: + break + root = child def tree(root: HTMLNode, *parts: str): @@ -164,22 +167,10 @@ def inner(root: HTMLNode = root): else: yield UnpackResult(tagpath, inner, **meta) - tag_pre_count = Counter() - tag_run_count = Counter() for child in root.children: if child.textual: continue - tag_pre_count[child.tag] += 1 - - for child in root.children: - if child.textual: - continue - if tag_pre_count[child.tag] == 1: - yield from tree(child, *parts, path(child)) - continue - tag_run_count[child.tag] += 1 - index = tag_run_count[child.tag] - yield from tree(child, *parts, path(child, index)) + yield from tree(child, *parts, path(child)) yield from tree(root, path(root)) diff --git a/refinery/units/formats/xml.py b/refinery/units/formats/xml.py index fa1af0cfad..4a7cbcdfdf 100644 --- a/refinery/units/formats/xml.py +++ b/refinery/units/formats/xml.py @@ -1,7 +1,5 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -from collections import Counter - from refinery.lib.structures import MemoryFile from refinery.lib.meta import metavars from refinery.lib import xml