diff --git a/vespa/repository/api/ingestion/config.py b/vespa/repository/api/ingestion/config.py index 56e5b9a2..0f6dec6e 100644 --- a/vespa/repository/api/ingestion/config.py +++ b/vespa/repository/api/ingestion/config.py @@ -107,6 +107,9 @@ { 'url': 'https://planet.openstreetmap.org/planet/planet-latest.osm.bz2', 'file_type': 'xml', + 'filters': [ + lambda node: 'place' in node, # Filter to only include nodes tagged as places + ] } ], }, diff --git a/vespa/repository/api/ingestion/processor.py b/vespa/repository/api/ingestion/processor.py index 51ed07c5..f2e252d0 100644 --- a/vespa/repository/api/ingestion/processor.py +++ b/vespa/repository/api/ingestion/processor.py @@ -305,6 +305,11 @@ async def process_batch(batch): # Process the batch when it reaches the batch_size or limit if len(current_batch) >= batch_size or (limit is not None and count >= limit): + + # Temporarily skip processing TODO: REMOVE + logger.info(document) + continue + batch_results = await process_batch(current_batch) results.extend(batch_results) # Collect results current_batch = [] diff --git a/vespa/repository/api/ingestion/streamer.py b/vespa/repository/api/ingestion/streamer.py index 323f88df..dbd27cf7 100644 --- a/vespa/repository/api/ingestion/streamer.py +++ b/vespa/repository/api/ingestion/streamer.py @@ -178,8 +178,18 @@ async def async_generator(): def _parse_xml_stream(self, stream): # Parse XML incrementally from stream for event, elem in xml.etree.ElementTree.iterparse(stream, events=('end',)): - if elem.tag == 'place': # Assuming the root element of interest is - yield elem + if elem.tag == 'node': + # Create a dictionary from element attributes + elem_data = dict(elem.attrib) + + # Add child tag attributes (k, v) to the dictionary + for tag in elem.findall('tag'): + key = tag.attrib.get('k') + value = tag.attrib.get('v') + if key and value: + elem_data[key] = value + + yield elem_data elem.clear() # Free memory def _split_triple(self, line): diff --git a/vespa/repository/api/ingestion/subtransformers/geonames/names.py b/vespa/repository/api/ingestion/subtransformers/geonames/names.py index d00b70af..306818c5 100644 --- a/vespa/repository/api/ingestion/subtransformers/geonames/names.py +++ b/vespa/repository/api/ingestion/subtransformers/geonames/names.py @@ -55,8 +55,10 @@ def process(self) -> dict: return self.output years = { - **({'year_start': year_start} if (year_start := self.name.get('from')) else {}), - **({'year_end': year_end} if (year_end := self.name.get('to')) else {}), + # **({'year_start': year_start} if (year_start := self.name.get('from')) else {}), + # **({'year_end': year_end} if (year_end := self.name.get('to')) else {}), + 'year_start': self.name.get('from', 2025), + 'year_end': self.name.get('to', 2025), } self.output['names'].append({