diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 70c8c41..5e5f60d 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -149,9 +149,9 @@ jobs: strategy: matrix: platform: - - runner: macos-12 + - runner: macos-13 target: x86_64 - - runner: macos-14 + - runner: macos-15 target: aarch64 steps: - uses: actions/checkout@v4 diff --git a/popgetter/src/data_request_spec.rs b/popgetter/src/data_request_spec.rs index 0d859d8..db2c1f2 100644 --- a/popgetter/src/data_request_spec.rs +++ b/popgetter/src/data_request_spec.rs @@ -4,8 +4,8 @@ use serde::{Deserialize, Serialize}; use crate::geo::BBox; use crate::search::{ - DownloadParams, GeometryLevel, MetricId, Params, SearchContext, SearchParams, SearchText, - YearRange, + CaseSensitivity, DownloadParams, GeometryLevel, MatchType, MetricId, Params, SearchConfig, + SearchContext, SearchParams, SearchText, YearRange, }; #[derive(Serialize, Deserialize, Clone, Debug, Default)] @@ -36,6 +36,10 @@ impl TryFrom for Params { SearchContext::Hxl, SearchContext::Description ], + config: SearchConfig { + match_type: MatchType::Regex, + case_sensitivity: CaseSensitivity::Insensitive, + }, }), _ => None, }) @@ -57,11 +61,20 @@ impl TryFrom for Params { _ => None, }) .collect_vec(), - geometry_level: value - .geometry - .as_ref() - .and_then(|geometry| geometry.geometry_level.to_owned().map(GeometryLevel)), + geometry_level: value.geometry.as_ref().and_then(|geometry| { + geometry + .geometry_level + .to_owned() + .map(|geometry_level| GeometryLevel { + value: geometry_level, + config: SearchConfig { + match_type: MatchType::Exact, + case_sensitivity: CaseSensitivity::Insensitive, + }, + }) + }), source_data_release: None, + source_download_url: None, data_publisher: None, country: None, source_metric_id: None, diff --git a/popgetter/src/search.rs b/popgetter/src/search.rs index 8f8e228..37bedd1 100644 --- a/popgetter/src/search.rs +++ b/popgetter/src/search.rs @@ -10,7 +10,7 @@ use crate::{ }; use anyhow::bail; use chrono::NaiveDate; -use log::{debug, warn}; +use log::{debug, error, warn}; use nonempty::{nonempty, NonEmpty}; use polars::lazy::dsl::{col, lit, Expr}; use polars::prelude::{DataFrame, DataFrameJoinOps, IntoLazy, LazyFrame}; @@ -68,15 +68,40 @@ fn _combine_exprs_with_and1(exprs: NonEmpty) -> Expr { /// Search in a column case-insensitively for a string literal (i.e. not a regex!). The search /// parameter can appear anywhere in the column value. -fn case_insensitive_contains(column: &str, value: &str) -> Expr { - let regex = format!("(?i){}", regex::escape(value)); +fn filter_contains(column: &str, value: &str, case_sensitivity: &CaseSensitivity) -> Expr { + let regex = match case_sensitivity { + CaseSensitivity::Insensitive => format!("(?i){}", regex::escape(value)), + CaseSensitivity::Sensitive => regex::escape(value).to_string(), + }; + col(column).str().contains(lit(regex), false) +} + +/// Search in a column for a string literal (i.e. not a regex!). The search parameter must be a +/// prefix of the column value. +fn filter_startswith(column: &str, value: &str, case_sensitivity: &CaseSensitivity) -> Expr { + let regex = match case_sensitivity { + CaseSensitivity::Insensitive => format!("(?i)^{}", regex::escape(value)), + CaseSensitivity::Sensitive => format!("^{}", regex::escape(value)), + }; col(column).str().contains(lit(regex), false) } /// Search in a column case-insensitively for a string literal (i.e. not a regex!). The search /// parameter must be a prefix of the column value. -fn case_insensitive_startswith(column: &str, value: &str) -> Expr { - let regex = format!("(?i)^{}", regex::escape(value)); +fn filter_exact(column: &str, value: &str, case_sensitivity: &CaseSensitivity) -> Expr { + let regex = match case_sensitivity { + CaseSensitivity::Insensitive => format!("(?i)^{}$", regex::escape(value)), + CaseSensitivity::Sensitive => format!("^{}$", regex::escape(value)), + }; + col(column).str().contains(lit(regex), false) +} + +/// Search in a column for a regex (case insensitively) +fn filter_regex(column: &str, value: &str, case_sensitivity: &CaseSensitivity) -> Expr { + let regex = match case_sensitivity { + CaseSensitivity::Insensitive => format!("(?i){}", value), + CaseSensitivity::Sensitive => value.to_string(), + }; col(column).str().contains(lit(regex), false) } @@ -95,20 +120,43 @@ impl SearchContext { } } +// TODO: can this be written with From<&MatchType> for impl Fn(&str, &str, &CaseSensitivity) -> Expr +fn get_filter_fn(match_type: &MatchType) -> impl Fn(&str, &str, &CaseSensitivity) -> Expr { + match match_type { + MatchType::Regex => filter_regex, + MatchType::Exact => filter_exact, + MatchType::Contains => filter_contains, + MatchType::Startswith => filter_startswith, + } +} + +fn get_queries_for_search_text Expr>( + filter_fn: F, + val: SearchText, +) -> Expr { + let queries: NonEmpty = val.context.map(|field| match field { + SearchContext::Hxl => { + filter_fn(COL::METRIC_HXL_TAG, &val.text, &val.config.case_sensitivity) + } + SearchContext::HumanReadableName => filter_fn( + COL::METRIC_HUMAN_READABLE_NAME, + &val.text, + &val.config.case_sensitivity, + ), + SearchContext::Description => filter_fn( + COL::METRIC_DESCRIPTION, + &val.text, + &val.config.case_sensitivity, + ), + }); + combine_exprs_with_or1(queries) +} + /// Implementing conversion from `SearchText` to a polars expression enables a /// `SearchText` to be passed to polars dataframe for filtering results. impl From for Expr { fn from(val: SearchText) -> Self { - let queries: NonEmpty = val.context.map(|field| match field { - SearchContext::Hxl => case_insensitive_contains(COL::METRIC_HXL_TAG, &val.text), - SearchContext::HumanReadableName => { - case_insensitive_contains(COL::METRIC_HUMAN_READABLE_NAME, &val.text) - } - SearchContext::Description => { - case_insensitive_contains(COL::METRIC_DESCRIPTION, &val.text) - } - }); - combine_exprs_with_or1(queries) + get_queries_for_search_text(get_filter_fn(&val.config.match_type), val) } } @@ -144,48 +192,95 @@ impl From for Expr { impl From for Expr { fn from(value: DataPublisher) -> Self { - case_insensitive_contains(COL::DATA_PUBLISHER_NAME, &value.0) + get_filter_fn(&value.config.match_type)( + COL::DATA_PUBLISHER_NAME, + &value.value, + &value.config.case_sensitivity, + ) + } +} + +impl From for Expr { + fn from(value: SourceDownloadUrl) -> Self { + get_filter_fn(&value.config.match_type)( + COL::METRIC_SOURCE_DOWNLOAD_URL, + &value.value, + &value.config.case_sensitivity, + ) } } impl From for Expr { fn from(value: SourceDataRelease) -> Self { - case_insensitive_contains(COL::SOURCE_DATA_RELEASE_NAME, &value.0) + get_filter_fn(&value.config.match_type)( + COL::SOURCE_DATA_RELEASE_NAME, + &value.value, + &value.config.case_sensitivity, + ) } } impl From for Expr { fn from(value: GeometryLevel) -> Self { - case_insensitive_contains(COL::GEOMETRY_LEVEL, &value.0) + get_filter_fn(&value.config.match_type)( + COL::GEOMETRY_LEVEL, + &value.value, + &value.config.case_sensitivity, + ) } } +fn combine_country_fn Expr>(func: F, value: &str) -> Expr { + // Assumes case insensitive + combine_exprs_with_or(vec![ + func( + COL::COUNTRY_NAME_SHORT_EN, + value, + &CaseSensitivity::Insensitive, + ), + func( + COL::COUNTRY_NAME_OFFICIAL, + value, + &CaseSensitivity::Insensitive, + ), + func(COL::COUNTRY_ISO2, value, &CaseSensitivity::Insensitive), + func(COL::COUNTRY_ISO3, value, &CaseSensitivity::Insensitive), + func(COL::COUNTRY_ISO3166_2, value, &CaseSensitivity::Insensitive), + // TODO: add `COUNTRY_ID` for ExpandedMetadata + // func(COL::COUNTRY_ID, &value, &CaseSensitivity::Insensitive), + func( + COL::DATA_PUBLISHER_COUNTRIES_OF_INTEREST, + value, + &CaseSensitivity::Insensitive, + ), + ]) + // Unwrap: cannot be None as vec above is non-empty + .unwrap() +} + impl From for Expr { fn from(value: Country) -> Self { - combine_exprs_with_or(vec![ - case_insensitive_contains(COL::COUNTRY_NAME_SHORT_EN, &value.0), - case_insensitive_contains(COL::COUNTRY_NAME_OFFICIAL, &value.0), - case_insensitive_contains(COL::COUNTRY_ISO2, &value.0), - case_insensitive_contains(COL::COUNTRY_ISO3, &value.0), - case_insensitive_contains(COL::COUNTRY_ISO3166_2, &value.0), - // TODO: add `COUNTRY_ID` for ExpandedMetadata - // case_insensitive_contains(COL::COUNTRY_ID, &value.0), - case_insensitive_contains(COL::DATA_PUBLISHER_COUNTRIES_OF_INTEREST, &value.0), - ]) - // Unwrap: cannot be None as vec above is non-empty - .unwrap() + combine_country_fn(get_filter_fn(&value.config.match_type), &value.value) } } impl From for Expr { fn from(value: SourceMetricId) -> Self { - case_insensitive_contains(COL::METRIC_SOURCE_METRIC_ID, &value.0) + get_filter_fn(&value.config.match_type)( + COL::METRIC_SOURCE_METRIC_ID, + &value.value, + &value.config.case_sensitivity, + ) } } impl From for Expr { fn from(value: MetricId) -> Self { - case_insensitive_startswith(COL::METRIC_ID, &value.0) + get_filter_fn(&value.config.match_type)( + COL::METRIC_ID, + &value.id, + &value.config.case_sensitivity, + ) } } @@ -193,13 +288,19 @@ impl From for Expr { pub struct SearchText { pub text: String, pub context: NonEmpty, + pub config: SearchConfig, } impl Default for SearchText { fn default() -> Self { + // TODO: check that this functions ok where default is currently used for SearchText Self { text: "".to_string(), context: SearchContext::all(), + config: SearchConfig { + match_type: MatchType::Exact, + case_sensitivity: CaseSensitivity::Insensitive, + }, } } } @@ -247,27 +348,84 @@ impl FromStr for YearRange { /// Search over metric IDs #[derive(Clone, Debug, Deserialize, Serialize)] -pub struct MetricId(pub String); +pub struct MetricId { + pub id: String, + #[serde(default = "default_metric_id_search_config")] + pub config: SearchConfig, +} + +fn default_metric_id_search_config() -> SearchConfig { + SearchConfig { + match_type: MatchType::Startswith, + case_sensitivity: CaseSensitivity::Insensitive, + } +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)] +pub enum MatchType { + Regex, + #[default] + Exact, + Contains, + Startswith, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)] +pub enum CaseSensitivity { + #[default] + Insensitive, + Sensitive, +} + +/// Configuration for searching. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct SearchConfig { + /// Whether string matching is exact or uses regex. + pub match_type: MatchType, + /// Whether matching is case sensitive or insensitive. + pub case_sensitivity: CaseSensitivity, +} /// Search over geometry levels #[derive(Clone, Debug, Deserialize, Serialize)] -pub struct GeometryLevel(pub String); +pub struct GeometryLevel { + pub value: String, + pub config: SearchConfig, +} /// Search over source data release names #[derive(Clone, Debug, Deserialize, Serialize)] -pub struct SourceDataRelease(pub String); +pub struct SourceDataRelease { + pub value: String, + pub config: SearchConfig, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SourceDownloadUrl { + pub value: String, + pub config: SearchConfig, +} /// Search over data publisher names #[derive(Clone, Debug, Deserialize, Serialize)] -pub struct DataPublisher(pub String); +pub struct DataPublisher { + pub value: String, + pub config: SearchConfig, +} /// Search over country (short English names) #[derive(Clone, Debug, Deserialize, Serialize)] -pub struct Country(pub String); +pub struct Country { + pub value: String, + pub config: SearchConfig, +} /// Search over source metric IDs in the original census table #[derive(Clone, Debug, Deserialize, Serialize)] -pub struct SourceMetricId(pub String); +pub struct SourceMetricId { + pub value: String, + pub config: SearchConfig, +} /// This struct represents all the possible parameters one can search the metadata catalogue with. /// All parameters are optional in that they can either be empty vectors or None. @@ -291,6 +449,7 @@ pub struct SearchParams { pub geometry_level: Option, pub source_data_release: Option, pub data_publisher: Option, + pub source_download_url: Option, pub country: Option, pub source_metric_id: Option, pub region_spec: Vec, @@ -336,6 +495,7 @@ impl From for Option { value.geometry_level.map(|v| v.into()), value.source_data_release.map(|v| v.into()), value.data_publisher.map(|v| v.into()), + value.source_download_url.map(|v| v.into()), value.country.map(|v| v.into()), value.source_metric_id.map(|v| v.into()), ]; @@ -349,6 +509,9 @@ impl From for Option { // Combine IDs provided in SearchParams with OR let combined_id_expr = to_queries_then_or(value.metric_id); + debug!("{:#?}", combined_non_id_expr); + debug!("{:#?}", combined_id_expr); + // Combine ID and non-ID SearchParams with OR combine_exprs_with_or( vec![combined_non_id_expr, combined_id_expr] @@ -447,7 +610,9 @@ impl SearchResults { // TODO Handle multiple geometries if all_geom_files.len() > 1 { - unimplemented!("Multiple geometries not supported in current release"); + let err_info = "Multiple geometries not supported in current release"; + error!("{err_info}: {all_geom_files:?}"); + unimplemented!("{err_info}"); } else if all_geom_files.is_empty() { bail!( "No geometry files for the following `metric_requests`: {:#?}", @@ -500,9 +665,68 @@ impl SearchResults { #[cfg(test)] mod tests { - // use super::*; - // #[test] - // fn test_search_request() { - // let mut sr = SearchRequest{search_string: None}.with_country("a").with_country("b"); - // } + + use polars::df; + + use super::*; + + fn test_df() -> DataFrame { + df!( + COL::METRIC_HUMAN_READABLE_NAME => &["Apple", "Apple", "Pear", "apple", ".apple", "lemon"], + COL::METRIC_HXL_TAG => &["Red", "Yellow", "Green", "red", "Green", "yellow"], + COL::METRIC_DESCRIPTION => &["Red", "Yellow", "Green", "red", "Green", "yellow"], + "index" => &[0u32, 1, 2, 3, 4, 5] + ) + .unwrap() + } + + fn test_search_params( + value: &str, + match_type: MatchType, + case_sensitivity: CaseSensitivity, + ) -> SearchParams { + SearchParams { + text: vec![SearchText { + text: value.to_string(), + context: nonempty![SearchContext::HumanReadableName], + config: SearchConfig { + match_type, + case_sensitivity, + }, + }], + ..Default::default() + } + } + + fn test_from_args( + value: &str, + match_type: MatchType, + case_sensitivity: CaseSensitivity, + expected_ids: &[u32], + ) -> anyhow::Result<()> { + let df = test_df(); + let search_params = test_search_params(value, match_type, case_sensitivity); + let expr = Option::::from(search_params.clone()).unwrap(); + let filtered = df.clone().lazy().filter(expr).collect()?; + assert_eq!(filtered.select(["index"])?, df!("index" => expected_ids)?); + Ok(()) + } + + #[test] + #[rustfmt::skip] + fn test_search_request() -> anyhow::Result<()> { + // 1. Test regex, sensitive, HumanReadableName col + test_from_args("^A", MatchType::Regex, CaseSensitivity::Sensitive, &[0, 1])?; + // 2. Test regex, insensitive, HumanReadableName col + test_from_args("^A", MatchType::Regex, CaseSensitivity::Insensitive, &[0, 1, 3])?; + // 3. Test exact, insensitive, HumanReadableName col + test_from_args("Apple", MatchType::Exact, CaseSensitivity::Sensitive, &[0, 1])?; + // 4. Test exact, sensitive, HumanReadableName col + test_from_args("Apple", MatchType::Exact, CaseSensitivity::Insensitive, &[0, 1, 3])?; + // 5. Test regex (as contains), insensitive, HumanReadableName col + test_from_args("Apple", MatchType::Regex, CaseSensitivity::Sensitive, &[0, 1])?; + // 6. Test regex (as contains), insensitive, HumanReadableName col + test_from_args("Apple", MatchType::Regex, CaseSensitivity::Insensitive, &[0, 1, 3, 4])?; + Ok(()) + } } diff --git a/popgetter_cli/src/cli.rs b/popgetter_cli/src/cli.rs index 66c5cf5..e6f903a 100644 --- a/popgetter_cli/src/cli.rs +++ b/popgetter_cli/src/cli.rs @@ -14,8 +14,9 @@ use popgetter::{ }, geo::BBox, search::{ - Country, DataPublisher, DownloadParams, GeometryLevel, MetricId, Params, SearchContext, - SearchParams, SearchResults, SearchText, SourceDataRelease, SourceMetricId, YearRange, + CaseSensitivity, Country, DataPublisher, DownloadParams, GeometryLevel, MatchType, + MetricId, Params, SearchConfig, SearchContext, SearchParams, SearchText, SourceDataRelease, + SourceDownloadUrl, SourceMetricId, YearRange, }, Popgetter, }; @@ -25,7 +26,10 @@ use std::{fs::File, path::Path}; use std::{io, process}; use strum_macros::EnumString; -use crate::display::{display_countries, display_search_results}; +use crate::display::{ + display_column, display_column_unique, display_countries, display_metdata_columns, + display_search_results, display_summary, +}; const DEFAULT_PROGRESS_SPINNER: Spinners = Spinners::Dots; const COMPLETE_PROGRESS_STRING: &str = "✔"; @@ -164,7 +168,8 @@ impl RunCommand for DataCommand { s.stop_with_symbol(COMPLETE_PROGRESS_STRING) } - print_metrics_count(search_results.clone()); + let len_requests = search_results.0.shape().0; + print_metrics_count(len_requests); let download_params: DownloadParams = CombinedParamsArgs { search_params_args: self.search_params_args.clone(), download_params_args: self.download_params_args.clone(), @@ -208,16 +213,71 @@ impl RunCommand for DataCommand { /// The set of ways to search will likley increase over time #[derive(Args, Debug)] pub struct MetricsCommand { + #[command(flatten)] + search_params_args: SearchParamsArgs, + // TODO: consider refactoring SummaryOptions into a separate subcommand so that + #[clap(flatten)] + summary_options: SummaryOptions, + #[clap(flatten)] + metrics_results_options: MetricsResultsOptions, + #[arg(from_global)] + quiet: bool, +} + +#[derive(Debug, Args)] +#[group(required = false, multiple = false)] +pub struct SummaryOptions { + #[arg(long, help = "Summarise results with count of unique values by field")] + summary: bool, + #[arg(long, help = "Unique values of a column", value_name = "COLUMN NAME")] + unique: Option, + #[arg(long, help = "Values of a column", value_name = "COLUMN NAME")] + column: Option, + #[arg(long, help = "Print columns of metadata")] + display_metadata_columns: bool, +} + +#[derive(Debug, Args)] +#[group(required = false, multiple = true)] +pub struct MetricsResultsOptions { #[arg( short, long, help = "Show all metrics even if there are a large number" )] full: bool, - #[command(flatten)] - search_params_args: SearchParamsArgs, - #[arg(from_global)] - quiet: bool, + #[arg(long, help = "Exclude description from search results")] + exclude_description: bool, +} + +#[derive(Debug, Clone, clap::ValueEnum, Copy)] +enum MatchTypeArgs { + Regex, + Exact, +} + +impl From for MatchType { + fn from(value: MatchTypeArgs) -> Self { + match value { + MatchTypeArgs::Exact => MatchType::Exact, + MatchTypeArgs::Regex => MatchType::Regex, + } + } +} + +#[derive(Debug, Clone, clap::ValueEnum, Copy)] +enum CaseSensitivityArgs { + Sensitive, + Insensitive, +} + +impl From for CaseSensitivity { + fn from(value: CaseSensitivityArgs) -> Self { + match value { + CaseSensitivityArgs::Insensitive => CaseSensitivity::Insensitive, + CaseSensitivityArgs::Sensitive => CaseSensitivity::Sensitive, + } + } } /// These are the command-line arguments that can be parsed into a SearchParams. The type is @@ -253,6 +313,8 @@ struct SearchParamsArgs { release)." )] source_metric_id: Option, + #[arg(long, help = "Filter by source download URL")] + source_download_url: Option, #[arg( short = 'i', long, @@ -283,6 +345,29 @@ struct SearchParamsArgs { (EPSG:3812)." )] bbox: Option, + #[arg( + value_enum, + short = 'm', + long, + value_name = "MATCH_TYPE", + help = "\ + Type of matching to perform on: 'geometry-level', 'source-data-release',\n\ + 'publisher', 'country', 'source-metric-id', 'hxl', 'name', 'description'\n\ + arguments during the search.\n", + default_value_t=MatchTypeArgs::Exact + )] + match_type: MatchTypeArgs, + #[arg( + value_enum, + long, + value_name = "CASE_SENSITIVITY", + help = "\ + Type of case sensitivity used in matching on: 'geometry-level',\n\ + 'source-data-release', 'publisher', 'country', 'source-metric-id', 'hxl',\n\ + 'name', 'description', 'text' arguments during the search.\n", + default_value_t=CaseSensitivityArgs::Insensitive + )] + case_sensitivity: CaseSensitivityArgs, } /// Expected behaviour: @@ -298,10 +383,8 @@ fn parse_year_range(value: &str) -> Result, anyhow::Error> { // A simple function to manage similaries across multiple cases. // May ultimately be generalised to a function to manage all progress UX // that can be switched on and off. -fn print_metrics_count(search_results: SearchResults) -> usize { - let len_requests = search_results.0.shape().0; +fn print_metrics_count(len_requests: usize) { println!("Found {len_requests} metric(s)."); - len_requests } fn text_searches_from_args( @@ -309,23 +392,42 @@ fn text_searches_from_args( name: Vec, description: Vec, text: Vec, + match_type: MatchType, + case_sensitivity: CaseSensitivity, ) -> Vec { let mut all_text_searches: Vec = vec![]; all_text_searches.extend(hxl.iter().map(|t| SearchText { text: t.clone(), context: nonempty![SearchContext::Hxl], + config: SearchConfig { + match_type, + case_sensitivity, + }, })); all_text_searches.extend(name.iter().map(|t| SearchText { text: t.clone(), context: nonempty![SearchContext::HumanReadableName], + config: SearchConfig { + match_type, + case_sensitivity, + }, })); all_text_searches.extend(description.iter().map(|t| SearchText { text: t.clone(), context: nonempty![SearchContext::Description], + config: SearchConfig { + match_type, + case_sensitivity, + }, })); all_text_searches.extend(text.iter().map(|t| SearchText { text: t.clone(), context: SearchContext::all(), + config: SearchConfig { + // Always use regex for "text" since SearchContext::all() includes multiple columns + match_type: MatchType::Regex, + case_sensitivity, + }, })); all_text_searches } @@ -333,14 +435,75 @@ fn text_searches_from_args( impl From for SearchParams { fn from(args: SearchParamsArgs) -> Self { SearchParams { - text: text_searches_from_args(args.hxl, args.name, args.description, args.text), + text: text_searches_from_args( + args.hxl, + args.name, + args.description, + args.text, + args.match_type.into(), + args.case_sensitivity.into(), + ), year_range: args.year_range.clone(), - geometry_level: args.geometry_level.clone().map(GeometryLevel), - source_data_release: args.source_data_release.clone().map(SourceDataRelease), - data_publisher: args.publisher.clone().map(DataPublisher), - country: args.country.clone().map(Country), - source_metric_id: args.source_metric_id.clone().map(SourceMetricId), - metric_id: args.id.clone().into_iter().map(MetricId).collect(), + geometry_level: args.geometry_level.clone().map(|value| GeometryLevel { + value, + config: SearchConfig { + match_type: args.match_type.into(), + case_sensitivity: args.case_sensitivity.into(), + }, + }), + source_data_release: args + .source_data_release + .clone() + .map(|value| SourceDataRelease { + value, + config: SearchConfig { + match_type: args.match_type.into(), + case_sensitivity: args.case_sensitivity.into(), + }, + }), + data_publisher: args.publisher.clone().map(|value| DataPublisher { + value, + config: SearchConfig { + match_type: args.match_type.into(), + case_sensitivity: args.case_sensitivity.into(), + }, + }), + source_download_url: args.source_download_url.map(|value| SourceDownloadUrl { + value, + // Always use regex for source download URL + config: SearchConfig { + match_type: MatchType::Regex, + case_sensitivity: CaseSensitivity::Insensitive, + }, + }), + country: args.country.clone().map(|value| Country { + value, + config: SearchConfig { + match_type: args.match_type.into(), + case_sensitivity: args.case_sensitivity.into(), + }, + }), + source_metric_id: args.source_metric_id.clone().map(|value| SourceMetricId { + value, + config: SearchConfig { + match_type: args.match_type.into(), + case_sensitivity: args.case_sensitivity.into(), + }, + }), + metric_id: args + .id + .clone() + .into_iter() + .map(|value| MetricId { + id: value, + // SearchConfig always `MatchType::Startswith` and `CaseSensitivity::Insensitive` + // for `MetricId` + config: SearchConfig { + match_type: MatchType::Startswith, + case_sensitivity: CaseSensitivity::Insensitive, + }, + }) + .collect(), region_spec: args .bbox .map(|bbox| vec![RegionSpec::BoundingBox(bbox)]) @@ -361,21 +524,54 @@ impl RunCommand for MetricsCommand { ) }); let popgetter = Popgetter::new_with_config_and_cache(config).await?; + let search_results = popgetter.search(&self.search_params_args.to_owned().into()); if let Some(mut s) = sp { s.stop_with_symbol(COMPLETE_PROGRESS_STRING); } - - let len_requests = print_metrics_count(search_results.clone()); - - if len_requests > 50 && !self.full { - display_search_results(search_results, Some(50))?; - println!( - "{} more results not shown. Use --full to show all results.", - len_requests - 50 - ); + let len_requests = search_results.0.shape().0; + + // Output options: + // Display: metadata columns + if self.summary_options.display_metadata_columns { + display_metdata_columns(&popgetter.metadata.combined_metric_source_geometry())?; + // Display: summary + } else if self.summary_options.summary { + display_summary(search_results)?; + // Display: unique + } else if let Some(column) = self.summary_options.unique.as_ref() { + display_column_unique(search_results, column)?; + // Display: column + } else if let Some(column) = self.summary_options.column.as_ref() { + display_column(search_results, column)?; + // Display: metrics results } else { - display_search_results(search_results, None)?; + // MetricsResultsOptions: exclude description + let display_search_results_fn = if self.metrics_results_options.exclude_description { + // display_search_results_no_description + display_search_results + } else { + display_search_results + }; + // MetricsResultsOptions: full + if len_requests > 50 && !self.metrics_results_options.full { + print_metrics_count(len_requests); + display_search_results_fn( + search_results, + Some(50), + self.metrics_results_options.exclude_description, + )?; + println!( + "{} more results not shown. Use --full to show all results.", + len_requests - 50 + ); + } else { + display_search_results_fn( + search_results, + None, + self.metrics_results_options.exclude_description, + )?; + } } Ok(()) } diff --git a/popgetter_cli/src/display.rs b/popgetter_cli/src/display.rs index 066544f..8b30a48 100644 --- a/popgetter_cli/src/display.rs +++ b/popgetter_cli/src/display.rs @@ -1,32 +1,90 @@ +use std::collections::HashMap; +use std::io::Write; +use std::sync::OnceLock; + use comfy_table::{presets::NOTHING, *}; use itertools::izip; - use polars::{frame::DataFrame, prelude::SortMultipleOptions}; -use popgetter::{search::SearchResults, COL}; +use popgetter::{metadata::ExpandedMetadata, search::SearchResults, COL}; -pub fn display_countries(countries: DataFrame, max_results: Option) -> anyhow::Result<()> { - let df_to_show = match max_results { - Some(max) => countries.head(Some(max)), - None => countries, - }; - let df_to_show = df_to_show.sort([COL::COUNTRY_ID], SortMultipleOptions::default())?; +static LOOKUP: OnceLock> = OnceLock::new(); + +// TODO: consider adding to column_names module +fn lookup() -> &'static HashMap<&'static str, &'static str> { + LOOKUP.get_or_init(|| { + let mut hm = HashMap::new(); + hm.insert(COL::COUNTRY_ID, "Country ID"); + hm.insert(COL::COUNTRY_NAME_OFFICIAL, "Country Name (official)"); + hm.insert(COL::COUNTRY_NAME_SHORT_EN, "Country Name (short)"); + hm.insert(COL::COUNTRY_ISO3, "ISO3116-1 alpha-3"); + hm.insert(COL::COUNTRY_ISO2, "ISO3116-2"); + hm.insert(COL::METRIC_ID, "Metric ID"); + hm.insert(COL::METRIC_HUMAN_READABLE_NAME, "Human readable name"); + hm.insert(COL::METRIC_DESCRIPTION, "Description"); + hm.insert(COL::METRIC_HXL_TAG, "HXL tag"); + hm.insert( + COL::SOURCE_DATA_RELEASE_COLLECTION_PERIOD_START, + "Collection date", + ); + hm.insert(COL::COUNTRY_NAME_SHORT_EN, "Country"); + hm.insert(COL::GEOMETRY_LEVEL, "Geometry level"); + hm.insert(COL::METRIC_SOURCE_DOWNLOAD_URL, "Source download URL"); + hm + }) +} + +fn create_table(width: Option, header_columns: Option<&[&str]>) -> Table { let mut table = Table::new(); table .load_preset(NOTHING) - .set_content_arrangement(ContentArrangement::Dynamic) - .set_header(vec![ - Cell::new("Country ID").add_attribute(Attribute::Bold), - Cell::new("Country Name (official)").add_attribute(Attribute::Bold), - Cell::new("Country Name (short)").add_attribute(Attribute::Bold), - Cell::new("ISO3116-1 alpha-3").add_attribute(Attribute::Bold), - Cell::new("ISO3116-2").add_attribute(Attribute::Bold), - ]) .set_style(comfy_table::TableComponent::BottomBorder, '─') - .set_style(comfy_table::TableComponent::MiddleHeaderIntersections, '─') - .set_style(comfy_table::TableComponent::HeaderLines, '─') .set_style(comfy_table::TableComponent::BottomBorderIntersections, '─') .set_style(comfy_table::TableComponent::TopBorder, '─') .set_style(comfy_table::TableComponent::TopBorderIntersections, '─'); + + // Set width if given + match width { + Some(width) => { + table + .set_width(width) + .set_content_arrangement(ContentArrangement::DynamicFullWidth); + } + None => { + table.set_content_arrangement(ContentArrangement::Dynamic); + } + } + + // Add header if given + if let Some(columns) = header_columns { + table + .set_style(comfy_table::TableComponent::HeaderLines, '─') + .set_style(comfy_table::TableComponent::MiddleHeaderIntersections, '─') + .set_header( + columns + .iter() + .map(|col| Cell::new(col).add_attribute(Attribute::Bold)) + .collect::>(), + ); + } + table +} + +pub fn display_countries(countries: DataFrame, max_results: Option) -> anyhow::Result<()> { + let df_to_show = match max_results { + Some(max) => countries.head(Some(max)), + None => countries, + }; + let df_to_show = df_to_show.sort([COL::COUNTRY_ID], SortMultipleOptions::default())?; + let mut table = create_table( + None, + Some(&[ + lookup().get(COL::COUNTRY_ID).unwrap(), + lookup().get(COL::COUNTRY_NAME_OFFICIAL).unwrap(), + lookup().get(COL::COUNTRY_NAME_SHORT_EN).unwrap(), + lookup().get(COL::COUNTRY_ISO3).unwrap(), + lookup().get(COL::COUNTRY_ISO2).unwrap(), + ]), + ); for ( country_id, country_name_official, @@ -48,90 +106,157 @@ pub fn display_countries(countries: DataFrame, max_results: Option) -> an country_iso3116_2.unwrap_or_default(), ]); } - println!("\n{}", table); - Ok(()) + + Ok(writeln!(&mut std::io::stdout(), "\n{}", table)?) } pub fn display_search_results( results: SearchResults, max_results: Option, + exclude_description: bool, ) -> anyhow::Result<()> { - let df_to_show = match max_results { + let mut df_to_show = match max_results { Some(max) => results.0.head(Some(max)), None => results.0, }; + df_to_show.as_single_chunk_par(); - for (metric_id, hrn, desc, hxl, date, country, level, download_url) in izip!( - df_to_show.column(COL::METRIC_ID)?.str()?, - df_to_show.column(COL::METRIC_HUMAN_READABLE_NAME)?.str()?, - df_to_show.column(COL::METRIC_DESCRIPTION)?.str()?, - df_to_show.column(COL::METRIC_HXL_TAG)?.str()?, - df_to_show - .column(COL::SOURCE_DATA_RELEASE_COLLECTION_PERIOD_START)? - .rechunk() - .iter(), - df_to_show.column(COL::COUNTRY_NAME_SHORT_EN)?.str()?, - // Note: if using iter on an AnyValue, need to rechunk first. - df_to_show.column(COL::GEOMETRY_LEVEL)?.rechunk().iter(), - df_to_show - .column(COL::METRIC_SOURCE_DOWNLOAD_URL)? - .rechunk() - .iter() - ) { - let mut table = Table::new(); - table - .load_preset(NOTHING) - .set_content_arrangement(ContentArrangement::Dynamic) - .set_style(comfy_table::TableComponent::BottomBorder, '─') - .set_style(comfy_table::TableComponent::BottomBorderIntersections, '─') - .set_style(comfy_table::TableComponent::TopBorder, '─') - .set_style(comfy_table::TableComponent::TopBorderIntersections, '─') - .add_row(vec![ - Cell::new("Metric ID").add_attribute(Attribute::Bold), - metric_id.unwrap().into(), - ]) - .add_row(vec![ - Cell::new("Metric ID (short)").add_attribute(Attribute::Bold), - metric_id - .unwrap() - .chars() - .take(8) - .collect::() - .into(), - ]) - .add_row(vec![ - Cell::new("Human readable name").add_attribute(Attribute::Bold), - hrn.unwrap().into(), - ]) - .add_row(vec![ - Cell::new("Description").add_attribute(Attribute::Bold), - desc.unwrap().into(), - ]) - .add_row(vec![ - Cell::new("HXL tag").add_attribute(Attribute::Bold), - hxl.unwrap().into(), - ]) - .add_row(vec![ - Cell::new("Collection date").add_attribute(Attribute::Bold), - format!("{date}").into(), - ]) - .add_row(vec![ - Cell::new("Country").add_attribute(Attribute::Bold), - country.unwrap().into(), - ]) - .add_row(vec![ - Cell::new("Geometry level").add_attribute(Attribute::Bold), - level.get_str().unwrap().into(), - ]) - .add_row(vec![ - Cell::new("Source download URL").add_attribute(Attribute::Bold), - download_url.get_str().unwrap().into(), - ]); - - let column = table.column_mut(0).unwrap(); - column.set_cell_alignment(CellAlignment::Right); - - println!("\n{}", table); + // Set columns conditional on exclude_description arg + let mut cols = vec![ + COL::METRIC_ID, + COL::METRIC_HUMAN_READABLE_NAME, + COL::METRIC_DESCRIPTION, + COL::METRIC_HXL_TAG, + COL::SOURCE_DATA_RELEASE_COLLECTION_PERIOD_START, + COL::COUNTRY_NAME_SHORT_EN, + COL::GEOMETRY_LEVEL, + COL::METRIC_SOURCE_DOWNLOAD_URL, + ]; + if exclude_description { + cols.retain(|&col| col.ne(COL::METRIC_DESCRIPTION)); + } + // See example for iteration over SeriesIter: https://stackoverflow.com/a/72443329 + let mut iters = df_to_show + .columns(&cols)? + .iter() + .map(|s| s.iter()) + .collect::>(); + + for _ in 0..df_to_show.height() { + let mut table = create_table(Some(100), None); + for (iter, col) in iters.iter_mut().zip(cols.to_vec()) { + let value = iter.next().unwrap(); + match col { + // Format: metric ID + COL::METRIC_ID => { + table + .add_row(vec![ + Cell::new(lookup().get(col).unwrap()).add_attribute(Attribute::Bold), + value.clone().get_str().unwrap().into(), + ]) + .add_row(vec![ + Cell::new("Metric ID (short)").add_attribute(Attribute::Bold), + value + .get_str() + .unwrap() + .chars() + .take(8) + .collect::() + .into(), + ]); + } + // Format: str + COL::COUNTRY_NAME_SHORT_EN + | COL::METRIC_HUMAN_READABLE_NAME + | COL::METRIC_DESCRIPTION + | COL::METRIC_HXL_TAG + | COL::GEOMETRY_LEVEL + | COL::METRIC_SOURCE_DOWNLOAD_URL => { + table.add_row(vec![ + Cell::new(lookup().get(col).unwrap()).add_attribute(Attribute::Bold), + value.get_str().unwrap().into(), + ]); + } + // Format: dates + COL::SOURCE_DATA_RELEASE_COLLECTION_PERIOD_START => { + table.add_row(vec![ + Cell::new(lookup().get(col).unwrap()).add_attribute(Attribute::Bold), + format!("{value}").into(), + ]); + } + // No missing columns are possible since all matching should be include in columns + _ => { + unreachable!() + } + } + } + writeln!(&mut std::io::stdout(), "{}", table)?; } Ok(()) } + +pub fn display_summary(results: SearchResults) -> anyhow::Result<()> { + let df_to_show = results.0; + // Columns to summarise + let cols = [ + COL::METRIC_ID, + COL::SOURCE_DATA_RELEASE_COLLECTION_PERIOD_START, + COL::COUNTRY_NAME_SHORT_EN, + COL::GEOMETRY_LEVEL, + COL::METRIC_SOURCE_DOWNLOAD_URL, + ]; + // Get unique values of each columns + let n_uniques = cols + .iter() + .map(|col| df_to_show.column(col).and_then(|el| el.n_unique())) + .collect::, _>>()?; + + // Create table + let mut table = create_table(None, Some(&["Column", "Unique values"])); + + // Write values + for (col, n_unique) in cols.iter().copied().zip(n_uniques.into_iter()) { + table.add_row(vec![ + Cell::new(col).add_attribute(Attribute::Bold), + n_unique.into(), + ]); + } + // Alignment + let column = table.column_mut(1).unwrap(); + column.set_cell_alignment(CellAlignment::Right); + + Ok(writeln!(&mut std::io::stdout(), "\n{}", table)?) +} + +/// Display a given column from the search results +pub fn display_column(search_results: SearchResults, column: &str) -> anyhow::Result<()> { + Ok(search_results + .0 + .column(column)? + .rechunk() + .iter() + .map(|el| el.get_str().map(|s| s.to_string()).unwrap()) + .try_for_each(|el| writeln!(&mut std::io::stdout(), "{el}"))?) +} + +/// Display the unique values of a given column from the search results +pub fn display_column_unique(search_results: SearchResults, column: &str) -> anyhow::Result<()> { + Ok(search_results + .0 + .column(column)? + .unique()? + .iter() + .map(|el| el.get_str().map(|s| s.to_string()).unwrap()) + .try_for_each(|el| writeln!(&mut std::io::stdout(), "{el}"))?) +} + +/// Display the columns of the expanded metadata that can be used for displaying metrics results +/// (either whole column or unique results) +pub fn display_metdata_columns(expanded_metadata: &ExpandedMetadata) -> anyhow::Result<()> { + Ok(expanded_metadata + .as_df() + .collect()? + .get_column_names() + .into_iter() + .try_for_each(|el| writeln!(&mut std::io::stdout(), "{el}"))?) +} diff --git a/popgetter_cli/src/main.rs b/popgetter_cli/src/main.rs index 38df0b2..be79513 100644 --- a/popgetter_cli/src/main.rs +++ b/popgetter_cli/src/main.rs @@ -21,7 +21,16 @@ async fn main() -> Result<()> { debug!("config: {config:?}"); if let Some(command) = args.command { - command.run(config).await?; + // Return ok if pipe is closed instead of error, otherwise return error + // See: https://stackoverflow.com/a/65760807, https://github.com/rust-lang/rust/issues/62569 + if let Err(err) = command.run(config).await { + if let Some(err) = err.downcast_ref::() { + if err.kind() == std::io::ErrorKind::BrokenPipe { + return Ok(()); + } + } + Err(err)?; + } } Ok(()) } diff --git a/popgetter_py/README.md b/popgetter_py/README.md index d12c893..81eb7dc 100644 --- a/popgetter_py/README.md +++ b/popgetter_py/README.md @@ -49,11 +49,18 @@ search_params = { "metric_id": [], "text": [{ "text": "Key: uniqueID, Value: B01001_001;", - "context": ["Hxl", "HumanReadableName", "Description"] + "context": ["Hxl", "HumanReadableName", "Description"], + "config": {"match_type": "Regex", "case_sensitivity": "Insensitive"} }], - "geometry_level": "tract", + "geometry_level": { + "value": "tract", + "config": {"match_type": "Exact", "case_sensitivity": "Insensitive"} + }, "year_range": [{"Between": [2021, 2021]}], - "country": "USA", + "country": { + "value": "USA", + "config": {"match_type": "Regex", "case_sensitivity": "Insensitive"} + }, "region_spec": [ {"BoundingBox": [-74.251785, 40.647043, -73.673286, 40.91014]} ] @@ -75,9 +82,9 @@ data_request_spec = { {"BoundingBox": [-74.251785, 40.647043, -73.673286, 40.91014]} ], "metrics": [ - {"MetricId": "f29c1976"}, - {"MetricId": "079f3ba3"}, - {"MetricId": "81cae95d"}, + {"MetricId": {"id": "f29c1976"}}, + {"MetricId": {"id": "079f3ba3"}}, + {"MetricId": {"id": "81cae95d"}}, {"MetricText": "Key: uniqueID, Value: B01001_001;"} ], "years": ["2021"], diff --git a/popgetter_py/src/lib.rs b/popgetter_py/src/lib.rs index e3a13d4..8c281c7 100644 --- a/popgetter_py/src/lib.rs +++ b/popgetter_py/src/lib.rs @@ -3,7 +3,10 @@ use std::default::Default; use ::popgetter::{ config::Config, data_request_spec::DataRequestSpec, - search::{DownloadParams, MetricId, Params, SearchParams, SearchText}, + search::{ + CaseSensitivity, DownloadParams, MatchType, MetricId, Params, SearchConfig, SearchParams, + SearchText, + }, Popgetter, COL, }; use polars::prelude::DataFrame; @@ -80,7 +83,13 @@ fn get_search_params(obj: &Bound<'_, PyAny>) -> PyResult { metric_id: text .to_string() .split(',') - .map(|id_str| MetricId(id_str.to_string())) + .map(|id_str| MetricId { + id: id_str.to_string(), + config: SearchConfig { + match_type: MatchType::Startswith, + case_sensitivity: CaseSensitivity::Insensitive, + }, + }) .collect::>(), ..Default::default() }); diff --git a/test_recipe.json b/test_recipe.json index 0c3f117..9f4d7d5 100644 --- a/test_recipe.json +++ b/test_recipe.json @@ -6,13 +6,19 @@ ], "metrics": [ { - "MetricId": "f29c1976" + "MetricId": { + "id": "f29c1976" + } }, { - "MetricId": "079f3ba3" + "MetricId": { + "id": "079f3ba3" + } }, { - "MetricId": "81cae95d" + "MetricId": { + "id": "81cae95d" + } }, { "MetricText": "Key: uniqueID, Value: B01001_001;"