diff --git a/README.rst b/README.rst index 87b66902..9fe11e53 100644 --- a/README.rst +++ b/README.rst @@ -60,9 +60,7 @@ Installation for Deployment To install pywb for usage, you can use: -```shell -pip install pywb -``` +``pip install pywb`` Note: depending on your Python installation, you may have to use `pip3` instead of `pip`. @@ -70,9 +68,7 @@ Note: depending on your Python installation, you may have to use `pip3` instead Installation from local copy ---------------------------- -```shell -git clone https://github.com/webrecorder/pywb -``` +``git clone https://github.com/webrecorder/pywb`` To install from a locally cloned copy, install with ``pip install -e .`` or ``python setup.py install``. diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index f3367e05..42b3e76e 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -667,10 +667,12 @@ def handle_request(self, environ, start_response): # store original script_name (original prefix) before modifications are made environ['ORIG_SCRIPT_NAME'] = environ.get('SCRIPT_NAME') - lang = args.pop('lang', self.default_locale) + lang = args.pop('lang', '') if lang: shift_path_info(environ) environ['pywb_lang'] = lang + elif self.default_locale: + environ['pywb_lang'] = self.default_locale response = endpoint(environ, **args) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 40a8bef8..d983c726 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -12,7 +12,7 @@ from pkg_resources import resource_string, get_distribution from argparse import ArgumentParser, RawTextHelpFormatter -from tempfile import mkdtemp +from tempfile import mkdtemp, TemporaryDirectory from zipfile import ZipFile from pywb.utils.loaders import load_yaml_config @@ -213,35 +213,35 @@ def _add_wacz_uncompressed(self, wacz): # delete temporary files shutil.rmtree(temp_dir) - @staticmethod - def _add_wacz_index(collection_index_path, wacz_index_path, filename_mapping): + def _add_wacz_index(self, collection_index_path, wacz_index_path, filename_mapping): from pywb.warcserver.index.cdxobject import CDXObject - # copy collection index to temporary directory - tempdir = mkdtemp() - collection_index_name = os.path.basename(collection_index_path) - collection_index_temp_path = os.path.join(tempdir, collection_index_name) - - if os.path.exists(collection_index_path): - shutil.copy2(collection_index_path, collection_index_temp_path) + # rewrite wacz index to temporary index file + tempdir = TemporaryDirectory() + wacz_index_name = os.path.basename(wacz_index_path) + rewritten_index_path = os.path.join(tempdir.name, wacz_index_name) - with open(collection_index_temp_path, 'a') as collection_index_temp_file: + with open(rewritten_index_path, 'w') as rewritten_index: if wacz_index_path.endswith('.gz'): - wacz_index_file = gzip.open(wacz_index_path, 'rb') + wacz_index = gzip.open(wacz_index_path, 'rb') else: - wacz_index_file = open(wacz_index_path, 'rb') - collection_index_temp_file.write('\n') - for line in wacz_index_file.readlines(): + wacz_index = open(wacz_index_path, 'rb') + + for line in wacz_index: cdx_object = CDXObject(cdxline=line) if cdx_object['filename'] in filename_mapping: cdx_object['filename'] = filename_mapping[cdx_object['filename']] - collection_index_temp_file.write(cdx_object.to_cdxj()) + rewritten_index.write(cdx_object.to_cdxj()) + + if not os.path.isfile(collection_index_path): + shutil.move(rewritten_index_path, collection_index_path) + return - wacz_index_file.close() + temp_coll_index_path = collection_index_path + '.tmp.' + timestamp20_now() + self._merge_indices(collection_index_path, rewritten_index_path, temp_coll_index_path) + shutil.move(temp_coll_index_path, collection_index_path) - # copy temporary index back to original location and delete temporary directory - shutil.move(collection_index_temp_path, collection_index_path) - shutil.rmtree(tempdir) + tempdir.cleanup() def reindex(self): cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE) @@ -294,20 +294,24 @@ def _index_merge_warcs(self, new_warcs, index_file, rel_root=None): merged_file = temp_file + '.merged' - last_line = None - - with open(cdx_file, 'rb') as orig_index: - with open(temp_file, 'rb') as new_index: - with open(merged_file, 'w+b') as merged: - for line in heapq.merge(orig_index, new_index): - if last_line != line: - merged.write(line) - last_line = line + self._merge_indices(cdx_file, temp_file, merged_file) shutil.move(merged_file, cdx_file) #os.rename(merged_file, cdx_file) os.remove(temp_file) + @staticmethod + def _merge_indices(index1, index2, dest): + last_line = None + + with open(index1, 'rb') as index1_f: + with open(index2, 'rb') as index2_f: + with open(dest, 'wb') as dest_f: + for line in heapq.merge(index1_f, index2_f): + if last_line != line: + dest_f.write(line) + last_line = line + def set_metadata(self, namevalue_pairs): metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml') metadata = None diff --git a/pywb/rewrite/templateview.py b/pywb/rewrite/templateview.py index 39c76ee7..c323a999 100644 --- a/pywb/rewrite/templateview.py +++ b/pywb/rewrite/templateview.py @@ -178,7 +178,7 @@ def switch_locale(context, locale): request_uri = environ.get('REQUEST_URI', environ.get('PATH_INFO')) - if curr_loc: + if curr_loc and request_uri.startswith('/' + curr_loc + '/'): return request_uri.replace(curr_loc, locale, 1) app_prefix = environ.get('pywb.app_prefix', '') @@ -196,11 +196,11 @@ def get_locale_prefixes(context): orig_prefix = environ.get('pywb.app_prefix', '') coll = environ.get('SCRIPT_NAME', '') - if orig_prefix: + if orig_prefix and coll.startswith(orig_prefix): coll = coll[len(orig_prefix):] curr_loc = environ.get('pywb_lang', '') - if curr_loc: + if curr_loc and coll.startswith('/' + curr_loc): coll = coll[len(curr_loc) + 1:] for locale in loc_map.keys(): diff --git a/pywb/templates/error.html b/pywb/templates/error.html index 2cf9a276..ba15dbd5 100644 --- a/pywb/templates/error.html +++ b/pywb/templates/error.html @@ -3,7 +3,7 @@ {% block body %}
-

Pywb Error

+

{{ _('Pywb Error') }}

diff --git a/tests/test_manager.py b/tests/test_manager.py index 285e64f8..c960674b 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -75,10 +75,15 @@ def test_merge_wacz_index(self, tmp_path): {'example.warc.gz': 'rewritten.warc.gz'}) with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: index_content = f.read() + index_content = index_content.strip() assert 'example.warc.gz' not in index_content assert 'rewritten.warc.gz' in index_content + # check that collection index is sorted + index_lines = index_content.split('\n') + assert sorted(index_lines) == index_lines + def test_merge_wacz_index_gzip(self, tmp_path): manager = self.get_test_collections_manager(tmp_path) manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),