From 1d0a0a8eb5a4eb0c9ba3415f21180333ccb11e18 Mon Sep 17 00:00:00 2001
From: Alexey Rubaseff
Date: Sun, 24 Mar 2024 11:56:15 +0200
Subject: [PATCH] ! chunked all items recursive parsing

---
 rozetka/entities/supercategory.py | 15 +++++++++++----
 rozetka/runners/parse_api.py      | 31 ++++++++++++++++++-------------
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/rozetka/entities/supercategory.py b/rozetka/entities/supercategory.py
index 05cb43e..51ce8d7 100644
--- a/rozetka/entities/supercategory.py
+++ b/rozetka/entities/supercategory.py
@@ -69,10 +69,7 @@ def get_fat_menu_categories():
     return output
 
 
-def get_all_items_recursively(loop=False) -> List[Item]:
-    if loop:
-        return list()
-
+def get_all_item_ids_recursively():
     _ = get_super_category_ids()
     categories = list(get_all_categories_recursively())
     all_categories = list(set(categories))
@@ -85,6 +82,16 @@ def get_all_items_recursively(loop=False) -> List[Item]:
     items_ids = tools.fncs_map((_._get_item_ids for _ in all_categories)) or []
     items_ids = list(set(chain(*items_ids)))
     LOG.green(f"Got {len(items_ids)} item ids from {all_categories_len} categories")
+    return items_ids, all_categories_len
+
+
+def get_all_items_recursively(loop=False, items_ids=None, all_categories_len=None) -> List[Item]:
+    if loop:
+        return list()
+
+    if items_ids is None:
+        items_ids, all_categories_len = get_all_item_ids_recursively()
+
     items = Item.parse_multiple(*items_ids, parse_subitems=False)
     LOG.green(f"Got {len(items)} items from {all_categories_len} categories")
 
diff --git a/rozetka/runners/parse_api.py b/rozetka/runners/parse_api.py
index cbb44f4..9b29cd1 100644
--- a/rozetka/runners/parse_api.py
+++ b/rozetka/runners/parse_api.py
@@ -10,7 +10,7 @@
 from rozetka.entities.item import Item
 from rozetka.entities.point import Point
-from rozetka.entities.supercategory import get_all_items_recursively
+from rozetka.entities.supercategory import get_all_items_recursively, get_all_item_ids_recursively
 from rozetka.tools import db, constants, tools
 
 LOG = Log.get_logger()
 
@@ -61,19 +61,24 @@ def _main():
     start = pendulum.now()
     LOG.verbose = constants.VERBOSE
 
-    all_items = get_all_items_recursively()
-
-    LOG.green(f"Building points for {len(all_items)} items")
-    points = list(map(build_item_point, all_items))
-    LOG.green(f"Dumping {len(points)} points")
-    # https://docs.influxdata.com/influxdb/v2.4/write-data/best-practices/optimize-writes/
-    chunked_points = tools.slice_list(points, 5000)
-    for chunked_points_item in Bar(f"Dumping {len(chunked_points)} point chunks").iter(chunked_points):
-        asyncio.run(db.dump_points_async(record=chunked_points_item))
-
+    all_item_ids, categories_count = get_all_item_ids_recursively()
+    chunked_items_ids = tools.slice_list(all_item_ids, 10000)
+    overall_length = 0
+    for chunked_item_ids in Bar(f"Parsing {len(chunked_items_ids)} item id chunks").iter(chunked_items_ids):
+        # Pass the chunk by keyword: positionally it would bind to `loop` and return an empty list.
+        all_items = get_all_items_recursively(items_ids=chunked_item_ids, all_categories_len=categories_count)
+        LOG.green(f"Building points for {len(all_items)} items")
+        points = list(map(build_item_point, all_items))
+        LOG.green(f"Dumping {len(points)} points")
+        # https://docs.influxdata.com/influxdb/v2.4/write-data/best-practices/optimize-writes/
+        chunked_points = tools.slice_list(points, 5000)
+        for chunked_points_item in Bar(f"Dumping {len(chunked_points)} point chunks").iter(chunked_points):
+            asyncio.run(db.dump_points_async(record=chunked_points_item))
+
+        overall_length += len(points)
     duration = pendulum.now().diff_for_humans(start)
-    LOG.green(f"Duration: {duration}")
-    return len(points)
+    LOG.green(f"Points: {overall_length}, Duration: {duration}")
+    return overall_length
 
 
 def main():