Skip to content

Commit

Permalink
feat(profiling): opensearch auto detecting index and field
Browse files Browse the repository at this point in the history
  • Loading branch information
WeryZebra-Yue committed Aug 16, 2023
1 parent 74c26b3 commit fbbf4db
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 1 deletion.
43 changes: 42 additions & 1 deletion datachecks/core/datasource/search_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.

from datetime import datetime
from typing import Dict
from typing import Dict, List

from dateutil import parser

Expand All @@ -25,11 +25,52 @@ class SearchIndexDataSource(DataSource):
Abstract class for search index data sources
"""

# Maps a search-engine field type name (as reported in an index mapping)
# to the Python type used to represent values of that field.
FIELD_TYPE_MAPPING = {
    # Textual types
    "text": str,
    "keyword": str,
    # Temporal types
    "date": datetime,
    # Integral types
    "long": int,
    "unsigned_long": int,
    "integer": int,
    "short": int,
    "byte": int,
    # Floating point types
    "double": float,
    "float": float,
    "half_float": float,
    "scaled_float": float,
    "boolean": bool,
    # Binary values are kept as str, same as the textual types
    "binary": str,
    # Structured types: both kinds of sub-document are represented as dict
    "nested": dict,
    "object": dict,
}

def __init__(self, data_source_name: str, data_connection: Dict):
    """
    Initialise the state shared by search index data sources
    :param data_source_name: name of the data source, forwarded to the parent class
    :param data_connection: connection settings, forwarded to the parent class
    """
    super().__init__(data_source_name, data_connection)

    # No search client is attached at construction time. The query methods
    # of this class read ``self.client``, so one has to be assigned before
    # they are called.
    self.client = None

def query_get_index_metadata(self) -> List[str]:
    """
    Get the names of the indices visible to the client
    :return: list of index names
    """
    # The wildcard is passed by keyword so the call does not depend on the
    # positional parameter order of the client's ``get_alias``.
    aliases_by_index = self.client.indices.get_alias(index="*")

    # Iterating the response yields its keys, which are the index names.
    return list(aliases_by_index)

def query_get_field_metadata(self, index_name: str) -> Dict[str, type]:
    """
    Get the field metadata
    :param index_name: name of the index
    :return: mapping of top-level field name to the Python type that
        represents it; fields whose type is not in FIELD_TYPE_MAPPING
        are left out
    """
    results_: Dict[str, type] = {}
    mappings = self.client.indices.get_mapping(index=index_name)
    # An index without any mapped field has no "properties" entry (and may
    # report an empty "mappings" object), so fall back to "no fields".
    properties = mappings[index_name].get("mappings", {}).get("properties", {})

    for field, value in properties.items():
        field_type = value.get("type")
        if field_type in self.FIELD_TYPE_MAPPING:
            results_[field] = self.FIELD_TYPE_MAPPING[field_type]
        elif "properties" in value or field_type == "object":
            # Sub-documents: either an implicit object (no "type", only
            # "properties") or an explicitly declared "object" field.
            results_[field] = self.FIELD_TYPE_MAPPING["nested"]
        # Any other field type has no known Python counterpart; skipping
        # it keeps one exotic field from aborting detection for the
        # whole index.

    return results_

def query_get_document_count(self, index_name: str, filters: Dict = None) -> int:
"""
Get the document count
Expand Down
18 changes: 18 additions & 0 deletions tests/integration/datasource/test_search_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,21 @@ def test_should_calculate_time_diff_in_second(
):
diff = opensearch_datasource.query_get_time_diff(INDEX_NAME, "last_fight")
assert diff >= 24 * 3600 * 3

def test_index_field_metadata(
    self, opensearch_datasource: OpenSearchSearchIndexDataSource
):
    """Field metadata reports the Python type of each checked field."""
    detected_types = opensearch_datasource.query_get_field_metadata(INDEX_NAME)

    # Checked in the same order as listed: name, age, last_fight.
    expected_types = (
        ("name", str),
        ("age", int),
        ("last_fight", datetime.datetime),
    )
    for field_name, python_type in expected_types:
        assert detected_types[field_name] == python_type

def test_index_metadata(
    self, opensearch_datasource: OpenSearchSearchIndexDataSource
):
    """The test index is among the index names reported by the data source."""
    discovered_index_names = opensearch_datasource.query_get_index_metadata()
    index_is_listed = INDEX_NAME in discovered_index_names

    assert index_is_listed

0 comments on commit fbbf4db

Please sign in to comment.