Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add JS module detection rewriting #810

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions pywb/rewrite/html_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ def __init__(self, url_rewriter,
super(HTMLRewriterMixin, self).__init__(url_rewriter, False)
self.charset = charset
self._wb_parse_context = None
self._wb_parse_module = False

if js_rewriter:
self.js_rewriter = js_rewriter
Expand Down Expand Up @@ -308,7 +309,8 @@ def _rewrite_script(self, script_content, inline_attr=False):
return ''

content = self.js_rewriter.rewrite_complete(script_content,
inline_attr=inline_attr)
inline_attr=inline_attr,
is_module=self._wb_parse_module)
if inline_attr:
content = self.ADD_WINDOW.sub('window.\\1', content)

Expand Down Expand Up @@ -433,7 +435,7 @@ def _rewrite_tag_attrs(self, tag, tag_attrs, set_parsing_context=True):
# URL not skipped, likely src='js/....', forcing abs to make sure, cause PHP MIME(JS) === HTML
attr_value = self._rewrite_url(attr_value, rw_mod, True)
self._write_attr('__wb_orig_src', ov, empty_attr=None)

elif attr_name == 'target':
target = attr_value
if target in ('_blank', '_parent', '_top'):
Expand Down Expand Up @@ -484,24 +486,30 @@ def _set_parse_context(self, tag, tag_attrs):
self._wb_parse_context = 'style'

elif tag == 'script':
if self._allow_js_type(tag_attrs):
result = self._allow_js_type(tag_attrs)
if result:
self._wb_parse_context = 'script'
self._wb_parse_module = (result == 'script-module')


def _allow_js_type(self, tag_attrs):
type_value = self.get_attr(tag_attrs, 'type')

if not type_value:
return True
return 'script'

type_value = type_value.lower()

if 'javascript' in type_value:
return True
return 'script'

if 'ecmascript' in type_value:
return True
return 'script'

return False
if type_value == 'module':
return 'script-module'

return None

def _rewrite_head(self, start_end):
# special case: head tag
Expand Down
47 changes: 47 additions & 0 deletions pywb/rewrite/regex_rewriters.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from pywb.rewrite.content_rewriter import StreamingRewriter
from pywb.utils.loaders import load_py_name
from pywb.utils.io import BUFF_SIZE
from six.moves.urllib.parse import unquote


Expand Down Expand Up @@ -276,12 +277,58 @@ class JSWombatProxyRewriter(RegexRewriter):
def __init__(self, rewriter, extra_rules=None):
    """Initialise the rewriter and copy the wrapper settings from the rules factory.

    :param rewriter: forwarded to the base initialiser and also kept as
        ``self.rewriter``
    :param extra_rules: optional extra rules, forwarded to the base
        initialiser and also kept as ``self.extra_rules``
    """
    super(JSWombatProxyRewriter, self).__init__(rewriter, extra_rules=extra_rules)

    self.rewriter = rewriter
    self.extra_rules = extra_rules

    # Per-instance copies, so set_as_module() can replace them later.
    factory = self.rules_factory
    self.first_buff = factory.first_buff
    self.last_buff = factory.last_buff
    self.local_objs = factory.local_objs

    # Tri-state: None = undecided, True/False once __call__ or
    # set_as_module() has settled whether the content is a module.
    self._is_module_check = None

def set_as_module(self):
    """Switch this rewriter into ES-module mode.

    Replaces ``first_buff`` with an ``import`` statement that pulls every
    name in ``self.local_objs`` from ``/static/__wb_module_decl.js``,
    clears ``last_buff``, and records that the module check is settled.
    """
    # join() accepts the iterable directly; no wrapping generator needed.
    imported_names = ", ".join(self.local_objs)

    self.first_buff = "\nimport {{ {0} }} from '/static/__wb_module_decl.js';\n".format(
        imported_names
    )
    self.last_buff = ""
    self._is_module_check = True

def __call__(self, rwinfo):
    """Rewrite the content described by ``rwinfo``, detecting ES modules first.

    While the module status is still undecided (``_is_module_check`` is
    ``None``), a leading chunk of up to ``BUFF_SIZE * 4`` is obtained via
    ``rwinfo.read_and_keep`` and passed to ``is_module``. If it looks like
    a module, ``set_as_module()`` is applied; otherwise the check is
    recorded as ``False`` so it is not repeated.

    :param rwinfo: rewrite info object; must provide ``read_and_keep``
        (presumably a non-consuming peek -- confirm against its definition)
    :return: whatever the base class ``__call__`` returns
    """
    # None is the "not yet decided" sentinel; compare by identity.
    if self._is_module_check is None:
        buf = rwinfo.read_and_keep(BUFF_SIZE * 4)

        if self.is_module(buf):
            self.set_as_module()
        else:
            self._is_module_check = False

    return super(JSWombatProxyRewriter, self).__call__(rwinfo)

@staticmethod
def is_module(string):
    """Return True if an ES module ``import`` or ``export`` statement is found.

    :param string: JavaScript source as ``str`` or ``bytes``; ``None`` and
        empty input yield ``False``
    :return: ``True`` when a line starts (after optional whitespace) with a
        recognised ``import`` or ``export`` form, else ``False``

    NOTE: the import pattern only recognises ``import`` followed by ``{`` or
    a quote; default (``import x from``) and namespace (``import * as``)
    imports are not matched by it.
    """
    # Both patterns are multi-line so a statement is detected on any line,
    # not only when it is the very first thing in the buffer (e.g. after a
    # leading comment or 'use strict' directive).
    import_rx = re.compile(br"^\s*?import\s*?[{\"']", re.M)
    export_rx = re.compile(br"^\s*?export\s*?({([\s\w,$\n]+?)}[\s;]*|default|class)\s+", re.M)

    if not string:
        return False

    if isinstance(string, str):
        string = string.encode("utf-8")

    # Cheap substring test first; only run the regex when the keyword occurs.
    if b"import" in string and import_rx.search(string):
        return True

    if b"export" in string and export_rx.search(string):
        return True

    return False

def rewrite_complete(self, string, **kwargs):
if not kwargs.get('inline_attr'):
if kwargs.get('is_module'):
self.set_as_module()

return super(JSWombatProxyRewriter, self).rewrite_complete(string)

# check if any of the wrapped objects are used in the script
Expand Down
19 changes: 19 additions & 0 deletions pywb/rewrite/test/test_regex_rewriters.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@
from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter, RxRules
from pywb.rewrite.regex_rewriters import JSWombatProxyRewriter

import pytest

urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/', 'https://localhost/web/')

Expand All @@ -367,6 +368,24 @@ def _test_xml(string):
def _test_css(string):
    """Run ``string`` through a ``CSSRewriter`` built on the shared ``urlrewriter``."""
    css_rewriter = CSSRewriter(urlrewriter)
    return css_rewriter.rewrite(string)

# Sources that must be detected as ES modules (imports first, then exports).
_MODULE_SOURCES = (
    "import './a-module.js'\n",
    "export { name1 };\n",
    "export default function functionName() { /* … */ }",
    "export class ClassName { /* … */ };",
)

# Inputs that must not be detected as modules, including empty and None.
_NON_MODULE_SOURCES = (
    "let counter = 0;\nconsole.log(counter);",
    "",
    None,
)


@pytest.mark.parametrize(
    "string, expected_return",
    [(source, True) for source in _MODULE_SOURCES]
    + [(source, False) for source in _NON_MODULE_SOURCES],
)
def test_is_module(string, expected_return):
    """Check ``JSWombatProxyRewriter.is_module`` against each sample input."""
    detected = JSWombatProxyRewriter.is_module(string)
    assert detected == expected_return

if __name__ == "__main__":
import doctest
doctest.testmod()
Expand Down
12 changes: 12 additions & 0 deletions pywb/static/__wb_module_decl.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// ES module that exports wombat-wrapped versions of the common global objects.
// The export names below are the bindings a rewritten module script imports
// from '/static/__wb_module_decl.js'.

// Look up `name` through wombat: returns self._wb_wombat.local_init(name) when
// wombat and its local_init exist and yield a truthy value; otherwise falls
// back to the plain global self[name].
var wrapObj = function(name) {return (self._wb_wombat && self._wb_wombat.local_init && self._wb_wombat.local_init(name)) || self[name]; };
// Install a default __WB_pmw on the global only if none is defined yet; it
// stores `obj` on `this` as __WB_source and returns `this`.
if (!self.__WB_pmw) { self.__WB_pmw = function(obj) { this.__WB_source = obj; return this; } }
// Module-scope bindings that shadow the same-named globals for importers.
const window = wrapObj("window");
const document = wrapObj("document");
const location = wrapObj("location");
const top = wrapObj("top");
const parent = wrapObj("parent");
const frames = wrapObj("frames");
const opener = wrapObj("opener");
// `self` and `globalThis` are held under prefixed local names and renamed on
// export: a module-level `const self` would shadow the global `self` that
// wrapObj and the __WB_pmw check above read.
const __self = wrapObj("self");
const __globalThis = wrapObj("globalThis");
export { window, document, location, top, parent, frames, opener, __self as self, __globalThis as globalThis };
4 changes: 2 additions & 2 deletions pywb/static/wombat.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pywb/static/wombatProxyMode.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pywb/static/wombatWorkers.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.