From 8787632e202d5683e40a304249bf4f67302895a1 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Mon, 29 Jul 2024 11:57:50 +0200 Subject: [PATCH 1/5] migrated tha aliasing logic to parquet --- server/main-api/Cargo.toml | 1 + server/main-api/src/setup/database/alias.rs | 141 ++++++++------------ server/main-api/src/setup/database/data.rs | 51 ++++--- server/main-api/src/setup/database/mod.rs | 28 +--- server/main-api/src/setup/meilisearch.rs | 1 + server/main-api/src/setup/transportation.rs | 4 +- 6 files changed, 98 insertions(+), 128 deletions(-) diff --git a/server/main-api/Cargo.toml b/server/main-api/Cargo.toml index 4d9aa5778..6500d5d81 100644 --- a/server/main-api/Cargo.toml +++ b/server/main-api/Cargo.toml @@ -69,6 +69,7 @@ actix-governor = { version = "0.5.0", features = ["logger"] } tempfile = "3.10.1" base64 = "0.22.1" time = "0.3.36" +polars = { git = "https://github.com/CommanderStorm/polars.git", branch = "serialisation-experiment", features = ["parquet", "serde", "dtype-full"] } [dev-dependencies] insta = { version = "1.39.0", features = ["yaml", "json", "redactions"] } diff --git a/server/main-api/src/setup/database/alias.rs b/server/main-api/src/setup/database/alias.rs index 02726788f..6927237e9 100644 --- a/server/main-api/src/setup/database/alias.rs +++ b/server/main-api/src/setup/database/alias.rs @@ -1,6 +1,7 @@ -use serde::Deserialize; - +use std::io::Write; use crate::limited::vec::LimitedVec; +use polars::prelude::*; +use tempfile::tempfile; #[derive(Debug)] pub(super) struct Alias { @@ -10,76 +11,6 @@ pub(super) struct Alias { visible_id: String, } -#[derive(Debug, Deserialize)] -struct AliasData { - id: String, - visible_id: Option, - aliases: Vec, - r#type: String, // what we display in the url -} -struct AliasIterator { - data: AliasData, - state: AliasIteratorState, -} -#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] -enum AliasIteratorState { - #[default] - Key, - VisibleId, - Alias(usize), - Done, -} -impl AliasIteratorState { - 
fn next_state(&mut self) -> Self { - match self { - Self::Key => Self::VisibleId, - Self::VisibleId => Self::Alias(0), - Self::Alias(i) => Self::Alias(*i + 1), - Self::Done => Self::Done, - } - } -} - -impl From for AliasIterator { - fn from(alias_data: AliasData) -> Self { - Self { - data: alias_data, - state: AliasIteratorState::default(), - } - } -} -impl Iterator for AliasIterator { - type Item = Alias; - fn next(&mut self) -> Option { - use AliasIteratorState as State; - let visible_id = self.data.visible_id.clone().unwrap_or(self.data.id.clone()); - let alias_len = self.data.aliases.len(); - let state = self.state; - self.state = self.state.next_state(); - match state { - State::Key => Some(Alias { - alias: self.data.id.clone(), - key: self.data.id.clone(), - r#type: self.data.r#type.clone(), - visible_id, - }), - State::VisibleId => Some(Alias { - alias: visible_id.clone(), - key: self.data.id.clone(), - r#type: self.data.r#type.clone(), - visible_id, - }), - State::Alias(index) if index < alias_len => Some(Alias { - alias: self.data.aliases[index].clone(), - key: self.data.id.clone(), - r#type: self.data.r#type.clone(), - visible_id, - }), - State::Alias(_) | State::Done => None, - } - } -} - impl Alias { async fn store( self, @@ -102,24 +33,58 @@ impl Alias { } } #[tracing::instrument] -pub async fn download_updates( - keys_which_need_updating: &LimitedVec, -) -> Result, crate::BoxedError> { +pub async fn download_updates() -> Result, crate::BoxedError> { let cdn_url = std::env::var("CDN_URL").unwrap_or_else(|_| "https://nav.tum.de/cdn".to_string()); - let aliase = reqwest::get(format!("{cdn_url}/api_data.json")) + let body = reqwest::get(format!("{cdn_url}/api_data.parquet")) .await? - .json::>() - .await? 
- .into_iter() - .filter(|d| { - keys_which_need_updating.is_empty() || keys_which_need_updating.0.contains(&d.id) - }) - .map(AliasIterator::from); - Ok(LimitedVec( - aliase - .flat_map(IntoIterator::into_iter) - .collect::>(), - )) + .error_for_status()? + .bytes() + .await?; + let mut aliase=Vec::::new(); + let mut file = tempfile()?; + file.write_all(&body)?; + let df = ParquetReader::new(&mut file) + .with_columns(Some(vec!["id".to_string(),"type".to_string(),"visible_id".to_string(),"aliases".to_string()])) + .finish().unwrap(); + let id_col=df.column("id")?.str()?; + let type_col=df.column("type")?.str()?; + let visible_id_col=df.column("visible_id")?.str()?; + for index in 0..id_col.len(){ + let id = id_col.get(index).unwrap(); + let r#type = type_col.get(index).unwrap(); + let visible_id=visible_id_col.get(index).unwrap(); + aliase.push(Alias{ + alias: id.to_string(), + key: id.to_string(), + r#type: r#type.to_string(), + visible_id: visible_id.to_string(), + }); + aliase.push(Alias{ + alias: visible_id.to_string(), + key: id.to_string(), + r#type: r#type.to_string(), + visible_id: visible_id.to_string(), + }); + } + + let df_expanded=df.explode(["aliases"])?; + let id_col=df_expanded.column("id")?.str()?; + let type_col=df_expanded.column("type")?.str()?; + let visible_id_col=df_expanded.column("visible_id")?.str()?; + let aliases_col=df_expanded.column("aliases")?.str()?; + for index in 0..id_col.len(){ + let alias = aliases_col.get(index).unwrap(); + let id = id_col.get(index).unwrap(); + let r#type = type_col.get(index).unwrap(); + let visible_id=visible_id_col.get(index).unwrap(); + aliase.push(Alias{ + alias: alias.to_string(), + key: id.to_string(), + r#type: r#type.to_string(), + visible_id: visible_id.to_string(), + }); + } + Ok(LimitedVec(aliase)) } #[tracing::instrument(skip(tx))] pub async fn load_all_to_db( diff --git a/server/main-api/src/setup/database/data.rs b/server/main-api/src/setup/database/data.rs index 2e07bbc65..7fe5e12b0 100644 --- 
a/server/main-api/src/setup/database/data.rs +++ b/server/main-api/src/setup/database/data.rs @@ -1,10 +1,12 @@ use std::collections::HashMap; use std::fmt; use std::hash::{Hash, Hasher}; - +use std::io::Write; +use polars::prelude::ParquetReader; use serde_json::Value; - +use tempfile::tempfile; use crate::limited::vec::LimitedVec; +use polars::prelude::*; #[derive(Clone)] pub(super) struct DelocalisedValues { @@ -123,19 +125,28 @@ impl DelocalisedValues { } } #[tracing::instrument] -pub async fn download_updates( - keys_which_need_updating: &LimitedVec, -) -> Result, crate::BoxedError> { +pub async fn download_updates() -> Result, crate::BoxedError> { let cdn_url = std::env::var("CDN_URL").unwrap_or_else(|_| "https://nav.tum.de/cdn".to_string()); - let tasks = reqwest::get(format!("{cdn_url}/api_data.json")) - .await? - .json::>>() + let body = reqwest::get(format!("{cdn_url}/api_data.parquet")) .await? - .into_iter() - .map(DelocalisedValues::from) - .filter(|d| keys_which_need_updating.0.contains(&d.key)) - .collect::>(); - Ok(tasks) + .error_for_status()? 
+ .bytes() + .await?; + let mut file = tempfile()?; + file.write_all(&body)?; + let df = ParquetReader::new(&mut file).finish().unwrap(); + let mut vals = Vec::::new(); + let col_names=df.get_column_names().clone(); + for index in 0..df.get_columns()[0].len() { + let row=df.get_row(index)?; + let mut hm=HashMap::new(); + for (i,a) in row.0.into_iter().enumerate(){ + let v=serde_json::to_value(a)?; + hm.insert(col_names[i].to_string(),v); + } + vals.push(DelocalisedValues::from(hm)); + } + Ok(LimitedVec(vals)) } #[tracing::instrument(skip(tx))] pub(super) async fn load_all_to_db( @@ -150,9 +161,19 @@ pub(super) async fn load_all_to_db( #[tracing::instrument] pub async fn download_status() -> Result, crate::BoxedError> { let cdn_url = std::env::var("CDN_URL").unwrap_or_else(|_| "https://nav.tum.de/cdn".to_string()); - let tasks = reqwest::get(format!("{cdn_url}/status_data.json")) + let body = reqwest::get(format!("{cdn_url}/status_data.parquet")) .await? - .json::>() + .error_for_status()? 
+ .bytes() .await?; + let mut file = tempfile()?; + file.write_all(&body)?; + let df = ParquetReader::new(&mut file).finish().unwrap(); + let id_col=Vec::from(df.column("id")?.str()?); + let hash_col=Vec::from(df.column("id")?.i64()?); + let tasks=id_col.into_iter().zip(hash_col).flat_map(|(id,hash)| match (id,hash) { + (Some(id),Some(hash))=>Some((id.to_string(),hash)), + _=>None, + }).collect(); Ok(LimitedVec(tasks)) } diff --git a/server/main-api/src/setup/database/mod.rs b/server/main-api/src/setup/database/mod.rs index 4b9c80629..5d92ebedb 100644 --- a/server/main-api/src/setup/database/mod.rs +++ b/server/main-api/src/setup/database/mod.rs @@ -1,4 +1,4 @@ -use tracing::{debug, debug_span, info, info_span}; +use tracing::{debug, debug_span, info}; use crate::limited::vec::LimitedVec; @@ -14,35 +14,15 @@ pub async fn setup(pool: &sqlx::PgPool) -> Result<(), crate::BoxedError> { } #[tracing::instrument(skip(pool))] pub async fn load_data(pool: &sqlx::PgPool) -> Result<(), crate::BoxedError> { - let status = data::download_status().await?.0; - let new_keys = status - .clone() - .into_iter() - .map(|(k, _)| k) - .collect::>(); - let new_hashes = status - .into_iter() - .map(|(_, h)| h) - .collect::>(); { - let _ = info_span!("deleting old data").enter(); - let mut tx = pool.begin().await?; - cleanup_deleted(&new_keys, &mut tx).await?; - tx.commit().await?; - } - let keys_which_need_updating = - find_keys_which_need_updating(pool, &new_keys, &new_hashes).await?; - if !keys_which_need_updating.is_empty() { - let _ = info_span!("loading changed data").enter(); - let data = data::download_updates(&keys_which_need_updating).await?; + let data = data::download_updates().await?; let mut tx = pool.begin().await?; data::load_all_to_db(data, &mut tx).await?; tx.commit().await?; } - if !keys_which_need_updating.is_empty() { - let _ = info_span!("loading new aliases").enter(); - let aliases = alias::download_updates(&keys_which_need_updating).await?; + { + let aliases = 
alias::download_updates().await?; let mut tx = pool.begin().await?; alias::load_all_to_db(aliases, &mut tx).await?; tx.commit().await?; diff --git a/server/main-api/src/setup/meilisearch.rs b/server/main-api/src/setup/meilisearch.rs index c5673998a..d46927b91 100644 --- a/server/main-api/src/setup/meilisearch.rs +++ b/server/main-api/src/setup/meilisearch.rs @@ -105,6 +105,7 @@ pub async fn load_data(client: &Client) -> Result<(), crate::BoxedError> { let cdn_url = std::env::var("CDN_URL").unwrap_or_else(|_| "https://nav.tum.de/cdn".to_string()); let documents = reqwest::get(format!("{cdn_url}/search_data.json")) .await? + .error_for_status()? .json::>() .await?; let res = entries diff --git a/server/main-api/src/setup/transportation.rs b/server/main-api/src/setup/transportation.rs index a51c87cbd..147532f3a 100644 --- a/server/main-api/src/setup/transportation.rs +++ b/server/main-api/src/setup/transportation.rs @@ -54,7 +54,9 @@ impl DBStation { #[tracing::instrument(skip(pool))] pub async fn setup(pool: &sqlx::PgPool) -> Result<(), crate::BoxedError> { let url = "https://raw.githubusercontent.com/TUM-Dev/NavigaTUM/main/data/external/results/public_transport.json"; - let transportations: Vec = reqwest::get(url).await?.json().await?; + let transportations = reqwest::get(url).await? + .error_for_status()? 
+ .json::>().await?; let transportations = transportations.into_iter().flat_map(|s| { let id = s.station.station_id.clone(); let mut stations = vec![DBStation::from_station(s.station, None)]; From e58c200417d75092cbaa0ae5eb91c6a55c7e0836 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Thu, 1 Aug 2024 19:31:30 +0200 Subject: [PATCH 2/5] formatting fix --- server/main-api/src/setup/database/alias.rs | 44 ++++++++++++--------- server/main-api/src/setup/database/data.rs | 36 +++++++++-------- server/main-api/src/setup/transportation.rs | 6 ++- 3 files changed, 49 insertions(+), 37 deletions(-) diff --git a/server/main-api/src/setup/database/alias.rs b/server/main-api/src/setup/database/alias.rs index 6927237e9..adb38456a 100644 --- a/server/main-api/src/setup/database/alias.rs +++ b/server/main-api/src/setup/database/alias.rs @@ -1,6 +1,6 @@ -use std::io::Write; use crate::limited::vec::LimitedVec; use polars::prelude::*; +use std::io::Write; use tempfile::tempfile; #[derive(Debug)] @@ -40,26 +40,32 @@ pub async fn download_updates() -> Result, crate::BoxedError> .error_for_status()? 
.bytes() .await?; - let mut aliase=Vec::::new(); + let mut aliase = Vec::::new(); let mut file = tempfile()?; file.write_all(&body)?; let df = ParquetReader::new(&mut file) - .with_columns(Some(vec!["id".to_string(),"type".to_string(),"visible_id".to_string(),"aliases".to_string()])) - .finish().unwrap(); - let id_col=df.column("id")?.str()?; - let type_col=df.column("type")?.str()?; - let visible_id_col=df.column("visible_id")?.str()?; - for index in 0..id_col.len(){ + .with_columns(Some(vec![ + "id".to_string(), + "type".to_string(), + "visible_id".to_string(), + "aliases".to_string(), + ])) + .finish() + .unwrap(); + let id_col = df.column("id")?.str()?; + let type_col = df.column("type")?.str()?; + let visible_id_col = df.column("visible_id")?.str()?; + for index in 0..id_col.len() { let id = id_col.get(index).unwrap(); let r#type = type_col.get(index).unwrap(); - let visible_id=visible_id_col.get(index).unwrap(); - aliase.push(Alias{ + let visible_id = visible_id_col.get(index).unwrap(); + aliase.push(Alias { alias: id.to_string(), key: id.to_string(), r#type: r#type.to_string(), visible_id: visible_id.to_string(), }); - aliase.push(Alias{ + aliase.push(Alias { alias: visible_id.to_string(), key: id.to_string(), r#type: r#type.to_string(), @@ -67,17 +73,17 @@ pub async fn download_updates() -> Result, crate::BoxedError> }); } - let df_expanded=df.explode(["aliases"])?; - let id_col=df_expanded.column("id")?.str()?; - let type_col=df_expanded.column("type")?.str()?; - let visible_id_col=df_expanded.column("visible_id")?.str()?; - let aliases_col=df_expanded.column("aliases")?.str()?; - for index in 0..id_col.len(){ + let df_expanded = df.explode(["aliases"])?; + let id_col = df_expanded.column("id")?.str()?; + let type_col = df_expanded.column("type")?.str()?; + let visible_id_col = df_expanded.column("visible_id")?.str()?; + let aliases_col = df_expanded.column("aliases")?.str()?; + for index in 0..id_col.len() { let alias = aliases_col.get(index).unwrap(); 
let id = id_col.get(index).unwrap(); let r#type = type_col.get(index).unwrap(); - let visible_id=visible_id_col.get(index).unwrap(); - aliase.push(Alias{ + let visible_id = visible_id_col.get(index).unwrap(); + aliase.push(Alias { alias: alias.to_string(), key: id.to_string(), r#type: r#type.to_string(), diff --git a/server/main-api/src/setup/database/data.rs b/server/main-api/src/setup/database/data.rs index 7fe5e12b0..ca8258ee2 100644 --- a/server/main-api/src/setup/database/data.rs +++ b/server/main-api/src/setup/database/data.rs @@ -1,12 +1,12 @@ +use crate::limited::vec::LimitedVec; +use polars::prelude::ParquetReader; +use polars::prelude::*; +use serde_json::Value; use std::collections::HashMap; use std::fmt; use std::hash::{Hash, Hasher}; use std::io::Write; -use polars::prelude::ParquetReader; -use serde_json::Value; use tempfile::tempfile; -use crate::limited::vec::LimitedVec; -use polars::prelude::*; #[derive(Clone)] pub(super) struct DelocalisedValues { @@ -136,13 +136,13 @@ pub async fn download_updates() -> Result, crate:: file.write_all(&body)?; let df = ParquetReader::new(&mut file).finish().unwrap(); let mut vals = Vec::::new(); - let col_names=df.get_column_names().clone(); + let col_names = df.get_column_names().clone(); for index in 0..df.get_columns()[0].len() { - let row=df.get_row(index)?; - let mut hm=HashMap::new(); - for (i,a) in row.0.into_iter().enumerate(){ - let v=serde_json::to_value(a)?; - hm.insert(col_names[i].to_string(),v); + let row = df.get_row(index)?; + let mut hm = HashMap::new(); + for (i, a) in row.0.into_iter().enumerate() { + let v = serde_json::to_value(a)?; + hm.insert(col_names[i].to_string(), v); } vals.push(DelocalisedValues::from(hm)); } @@ -169,11 +169,15 @@ pub async fn download_status() -> Result, crate::Boxed let mut file = tempfile()?; file.write_all(&body)?; let df = ParquetReader::new(&mut file).finish().unwrap(); - let id_col=Vec::from(df.column("id")?.str()?); - let 
hash_col=Vec::from(df.column("id")?.i64()?); - let tasks=id_col.into_iter().zip(hash_col).flat_map(|(id,hash)| match (id,hash) { - (Some(id),Some(hash))=>Some((id.to_string(),hash)), - _=>None, - }).collect(); + let id_col = Vec::from(df.column("id")?.str()?); + let hash_col = Vec::from(df.column("id")?.i64()?); + let tasks = id_col + .into_iter() + .zip(hash_col) + .flat_map(|(id, hash)| match (id, hash) { + (Some(id), Some(hash)) => Some((id.to_string(), hash)), + _ => None, + }) + .collect(); Ok(LimitedVec(tasks)) } diff --git a/server/main-api/src/setup/transportation.rs b/server/main-api/src/setup/transportation.rs index 147532f3a..ed47f95e9 100644 --- a/server/main-api/src/setup/transportation.rs +++ b/server/main-api/src/setup/transportation.rs @@ -54,9 +54,11 @@ impl DBStation { #[tracing::instrument(skip(pool))] pub async fn setup(pool: &sqlx::PgPool) -> Result<(), crate::BoxedError> { let url = "https://raw.githubusercontent.com/TUM-Dev/NavigaTUM/main/data/external/results/public_transport.json"; - let transportations = reqwest::get(url).await? + let transportations = reqwest::get(url) + .await? .error_for_status()? 
- .json::>().await?; + .json::>() + .await?; let transportations = transportations.into_iter().flat_map(|s| { let id = s.station.station_id.clone(); let mut stations = vec![DBStation::from_station(s.station, None)]; From 7bdad4bce5d0d85416d3ac2b00a76a9dd8b86a3a Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Sun, 4 Aug 2024 01:47:33 +0200 Subject: [PATCH 3/5] reverted the data update to use json again --- server/main-api/Cargo.toml | 3 +- server/main-api/src/setup/database/data.rs | 47 ++++++++-------------- server/main-api/src/setup/database/mod.rs | 15 +++++-- 3 files changed, 30 insertions(+), 35 deletions(-) diff --git a/server/main-api/Cargo.toml b/server/main-api/Cargo.toml index 6500d5d81..30cbc7e05 100644 --- a/server/main-api/Cargo.toml +++ b/server/main-api/Cargo.toml @@ -69,7 +69,8 @@ actix-governor = { version = "0.5.0", features = ["logger"] } tempfile = "3.10.1" base64 = "0.22.1" time = "0.3.36" -polars = { git = "https://github.com/CommanderStorm/polars.git", branch = "serialisation-experiment", features = ["parquet", "serde", "dtype-full"] } +polars = { version = "0.41.3", features = ["parquet", "dtype-struct"] } +#polars = { git = "https://github.com/CommanderStorm/polars.git", branch = "serialisation-experiment", features = ["parquet", "serde", "dtype-full"] } [dev-dependencies] insta = { version = "1.39.0", features = ["yaml", "json", "redactions"] } diff --git a/server/main-api/src/setup/database/data.rs b/server/main-api/src/setup/database/data.rs index ca8258ee2..efc9754db 100644 --- a/server/main-api/src/setup/database/data.rs +++ b/server/main-api/src/setup/database/data.rs @@ -125,28 +125,19 @@ impl DelocalisedValues { } } #[tracing::instrument] -pub async fn download_updates() -> Result, crate::BoxedError> { +pub async fn download_updates( + keys_which_need_updating: &LimitedVec, +) -> Result, crate::BoxedError> { let cdn_url = std::env::var("CDN_URL").unwrap_or_else(|_| "https://nav.tum.de/cdn".to_string()); - let body = 
reqwest::get(format!("{cdn_url}/api_data.parquet")) + let tasks = reqwest::get(format!("{cdn_url}/api_data.json")) .await? - .error_for_status()? - .bytes() - .await?; - let mut file = tempfile()?; - file.write_all(&body)?; - let df = ParquetReader::new(&mut file).finish().unwrap(); - let mut vals = Vec::::new(); - let col_names = df.get_column_names().clone(); - for index in 0..df.get_columns()[0].len() { - let row = df.get_row(index)?; - let mut hm = HashMap::new(); - for (i, a) in row.0.into_iter().enumerate() { - let v = serde_json::to_value(a)?; - hm.insert(col_names[i].to_string(), v); - } - vals.push(DelocalisedValues::from(hm)); - } - Ok(LimitedVec(vals)) + .json::>>() + .await? + .into_iter() + .map(DelocalisedValues::from) + .filter(|d| keys_which_need_updating.0.contains(&d.key)) + .collect::>(); + Ok(tasks) } #[tracing::instrument(skip(tx))] pub(super) async fn load_all_to_db( @@ -159,7 +150,7 @@ pub(super) async fn load_all_to_db( Ok(()) } #[tracing::instrument] -pub async fn download_status() -> Result, crate::BoxedError> { +pub async fn download_status() -> Result<(LimitedVec,LimitedVec), crate::BoxedError> { let cdn_url = std::env::var("CDN_URL").unwrap_or_else(|_| "https://nav.tum.de/cdn".to_string()); let body = reqwest::get(format!("{cdn_url}/status_data.parquet")) .await? 
@@ -170,14 +161,8 @@ pub async fn download_status() -> Result, crate::Boxed file.write_all(&body)?; let df = ParquetReader::new(&mut file).finish().unwrap(); let id_col = Vec::from(df.column("id")?.str()?); - let hash_col = Vec::from(df.column("id")?.i64()?); - let tasks = id_col - .into_iter() - .zip(hash_col) - .flat_map(|(id, hash)| match (id, hash) { - (Some(id), Some(hash)) => Some((id.to_string(), hash)), - _ => None, - }) - .collect(); - Ok(LimitedVec(tasks)) + let id_col=id_col.into_iter().filter_map(|s| s.map(String::from)).collect(); + let hash_col = Vec::from(df.column("hash")?.i64()?); + let hash_col = hash_col.into_iter().flatten().collect(); + Ok((LimitedVec(id_col), LimitedVec(hash_col))) } diff --git a/server/main-api/src/setup/database/mod.rs b/server/main-api/src/setup/database/mod.rs index 5d92ebedb..9f18d4439 100644 --- a/server/main-api/src/setup/database/mod.rs +++ b/server/main-api/src/setup/database/mod.rs @@ -1,4 +1,4 @@ -use tracing::{debug, debug_span, info}; +use tracing::{debug, debug_span, info, info_span}; use crate::limited::vec::LimitedVec; @@ -14,13 +14,22 @@ pub async fn setup(pool: &sqlx::PgPool) -> Result<(), crate::BoxedError> { } #[tracing::instrument(skip(pool))] pub async fn load_data(pool: &sqlx::PgPool) -> Result<(), crate::BoxedError> { + let (new_keys,new_hashes) = data::download_status().await?; { - let data = data::download_updates().await?; + let _ = info_span!("deleting old data").enter(); + let mut tx = pool.begin().await?; + cleanup_deleted(&new_keys, &mut tx).await?; + tx.commit().await?; + } + let keys_which_need_updating = + find_keys_which_need_updating(pool, &new_keys, &new_hashes).await?; + if !keys_which_need_updating.is_empty() { + let _ = info_span!("loading changed data").enter(); + let data = data::download_updates(&keys_which_need_updating).await?; let mut tx = pool.begin().await?; data::load_all_to_db(data, &mut tx).await?; tx.commit().await?; } - { let aliases = alias::download_updates().await?; let 
mut tx = pool.begin().await?; From 7e2ae9dd88ab32284029295132f1ae29805830f3 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Sun, 4 Aug 2024 02:51:23 +0200 Subject: [PATCH 4/5] finalised the parquet based intitalisations first steps --- server/Cargo.lock | 1015 ++++++++++++++++++- server/main-api/src/setup/database/alias.rs | 22 +- server/main-api/src/setup/database/data.rs | 7 +- server/main-api/src/setup/database/mod.rs | 2 +- 4 files changed, 983 insertions(+), 63 deletions(-) diff --git a/server/Cargo.lock b/server/Cargo.lock index e89300e2f..8e321a5db 100644 --- a/server/Cargo.lock +++ b/server/Cargo.lock @@ -76,7 +76,7 @@ dependencies = [ "ahash 0.8.11", "base64 0.22.1", "bitflags 2.6.0", - "brotli", + "brotli 6.0.0", "bytes", "bytestring", "derive_more", @@ -109,7 +109,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e01ed3140b2f8d422c68afa1ed2e85d996ea619c988ac834d255db32138655cb" dependencies = [ "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -223,7 +223,7 @@ dependencies = [ "actix-router", "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -366,9 +366,24 @@ checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] +[[package]] +name = "argminmax" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52424b59d69d69d5056d508b260553afd91c57e21849579cd1f50ee8b8b88eaa" +dependencies = [ + "num-traits", +] + +[[package]] +name = "array-init-cursor" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf7d0a018de4f6aa429b9d33d69edf69072b1c5b1cb8d3e4a5f7ef898fc3eb76" + [[package]] name = "arrayvec" version = "0.7.4" @@ -381,7 +396,7 @@ version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fec134f64e2bc57411226dfc4e52dec859ddfc7e711fc5e07b612584f000e4aa" dependencies = [ - "brotli", + "brotli 
6.0.0", "flate2", "futures-core", "memchr", @@ -391,6 +406,28 @@ dependencies = [ "zstd-safe", ] +[[package]] +name = "async-stream" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.72", +] + [[package]] name = "async-trait" version = "0.1.81" @@ -399,7 +436,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -411,6 +448,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "atoi_simd" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae037714f313c1353189ead58ef9eec30a8e8dc101b2622d461418fd59e28a9" + [[package]] name = "atomic-waker" version = "1.1.2" @@ -537,7 +580,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn", + "syn 2.0.72", "which", ] @@ -627,6 +670,17 @@ dependencies = [ "serde_with", ] +[[package]] +name = "brotli" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19483b140a7ac7174d34b5a581b406c64f84da5409d3e09cf4fff604f9270e67" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + [[package]] name = "brotli" version = "6.0.0" @@ -665,6 +719,20 @@ name = "bytemuck" version = "1.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.7.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee891b04274a59bd38b412188e24b849617b2e45a0fd8d057deb63e7403761b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.72", +] [[package]] name = "byteorder" @@ -724,7 +792,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -783,6 +851,28 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "chrono-tz" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d59ae0466b83e838b81a54256c39d5d7c20b9d7daa10510a242d9b75abd5936e" +dependencies = [ + "chrono", + "chrono-tz-build", + "phf", +] + +[[package]] +name = "chrono-tz-build" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f" +dependencies = [ + "parse-zoneinfo", + "phf", + "phf_codegen", +] + [[package]] name = "clang-sys" version = "1.8.1" @@ -809,6 +899,18 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" +[[package]] +name = "comfy-table" +version = "7.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" +dependencies = [ + "crossterm", + "strum", + "strum_macros", + "unicode-width", +] + [[package]] name = "concurrent-queue" version = "2.5.0" @@ -926,6 +1028,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-deque" version = "0.8.5" @@ -960,6 +1071,28 @@ version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +[[package]] +name = "crossterm" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" +dependencies = [ + "bitflags 2.6.0", + "crossterm_winapi", + "libc", + "parking_lot 0.12.3", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + [[package]] name = "crunchy" version = "0.2.2" @@ -997,7 +1130,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn", + "syn 2.0.72", ] [[package]] @@ -1008,7 +1141,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -1071,7 +1204,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn", + "syn 2.0.72", ] [[package]] @@ -1145,6 +1278,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56ce8c6da7551ec6c462cbaf3bfbc75131ebbfa1c944aeaa9dab51ca1c5f0c3b" +[[package]] +name = "dyn-clone" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" + [[package]] name = "either" version = "1.13.0" @@ -1178,7 +1317,19 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn", + "syn 2.0.72", +] + +[[package]] +name = "enum_dispatch" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa18ce2bc66555b3218614519ac839ddb759a7d6720732f979ef8d13be147ecd" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.72", ] [[package]] @@ -1208,6 +1359,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "ethnum" 
+version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b90ca2580b73ab6a1f724b76ca11ab632df820fd6040c336200d2c1df7b3c82c" + [[package]] name = "event-listener" version = "5.3.1" @@ -1235,6 +1392,18 @@ dependencies = [ "zune-inflate", ] +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "fast-float" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c" + [[package]] name = "fastrand" version = "2.1.0" @@ -1304,6 +1473,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" +[[package]] +name = "foreign_vec" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1396,7 +1571,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -1574,6 +1749,8 @@ checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash 0.8.11", "allocator-api2", + "rayon", + "serde", ] [[package]] @@ -2096,7 +2273,7 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -2150,6 +2327,12 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "itoap" +version = "1.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9028f49264629065d057f340a86acb84867925865f73bbf8d47b4d149a7e88b8" + [[package]] name = "jobserver" version = "0.1.32" @@ -2336,7 +2519,7 @@ dependencies = [ "proc-macro2", "quote", "regex-syntax 0.8.4", - "syn", + "syn 2.0.72", ] [[package]] @@ -2375,6 +2558,26 @@ dependencies = [ "linked-hash-map", ] +[[package]] +name = "lz4" +version = "1.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958b4caa893816eea05507c20cfe47574a43d9a697138a7872990bba8a0ece68" +dependencies = [ + "libc", + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "109de74d5d2353660401699a4174a4ff23fcc649caf553df71933c7fb45ad868" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -2429,7 +2632,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta", - "syn", + "syn 2.0.72", ] [[package]] @@ -2465,6 +2668,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memmap2" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" +dependencies = [ + "libc", +] + [[package]] name = "mime" version = "0.3.17" @@ -2517,6 +2729,28 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" +[[package]] +name = "multiversion" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4851161a11d3ad0bf9402d90ffc3967bf231768bfd7aeb61755ad06dbf1a142" +dependencies = [ + "multiversion-macros", + "target-features", +] + +[[package]] +name = "multiversion-macros" +version = "0.7.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "79a74ddee9e0c27d2578323c13905793e91622148f138ba29738f9dddb835e90" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "target-features", +] + [[package]] name = "mutually_exclusive_features" version = "0.0.3" @@ -2577,6 +2811,7 @@ dependencies = [ "meilisearch-sdk", "oauth2", "octocrab", + "polars", "pretty_assertions", "rand", "regex", @@ -2635,6 +2870,24 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" +[[package]] +name = "now" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89e9874397a1f0a52fc1f197a8effd9735223cb2390e9dcc83ac6cd02923d0" +dependencies = [ + "chrono", +] + +[[package]] +name = "ntapi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" +dependencies = [ + "winapi", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -2709,7 +2962,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -2847,7 +3100,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -2954,6 +3207,16 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "parquet-format-safe" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1131c54b167dd4e4799ce762e1ab01549ebb94d5bdd13e6ec1b467491c378e1f" +dependencies = [ + "async-trait", + "futures", +] + [[package]] name = "parse-display" version = "0.9.1" @@ -2976,7 +3239,16 @@ dependencies = [ "regex", "regex-syntax 0.8.4", "structmeta", - "syn", + "syn 2.0.72", +] + +[[package]] +name = 
"parse-zoneinfo" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f2a05b18d44e2957b88f96ba460715e295bc1d7510468a2f3d3b44535d26c24" +dependencies = [ + "regex", ] [[package]] @@ -3041,7 +3313,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -3055,6 +3327,44 @@ dependencies = [ "sha2", ] +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "1.1.5" @@ -3072,7 +3382,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -3114,6 +3424,15 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +[[package]] +name = "planus" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc1691dd09e82f428ce8d6310bd6d5da2557c82ff17694d2a32cad7242aea89f" +dependencies = [ + 
"array-init-cursor", +] + [[package]] name = "png" version = "0.17.13" @@ -3128,34 +3447,444 @@ dependencies = [ ] [[package]] -name = "portable-atomic" -version = "1.7.0" +name = "polars" +version = "0.41.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265" +checksum = "8e3351ea4570e54cd556e6755b78fe7a2c85368d820c0307cca73c96e796a7ba" +dependencies = [ + "getrandom", + "polars-arrow", + "polars-core", + "polars-error", + "polars-io", + "polars-lazy", + "polars-ops", + "polars-parquet", + "polars-sql", + "polars-time", + "polars-utils", + "version_check", +] [[package]] -name = "powerfmt" -version = "0.2.0" +name = "polars-arrow" +version = "0.41.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +checksum = "ba65fc4bcabbd64fca01fd30e759f8b2043f0963c57619e331d4b534576c0b47" +dependencies = [ + "ahash 0.8.11", + "atoi", + "atoi_simd", + "bytemuck", + "chrono", + "chrono-tz", + "dyn-clone", + "either", + "ethnum", + "fast-float", + "foreign_vec", + "futures", + "getrandom", + "hashbrown 0.14.5", + "itoa", + "itoap", + "lz4", + "multiversion", + "num-traits", + "polars-arrow-format", + "polars-error", + "polars-utils", + "ryu", + "simdutf8", + "streaming-iterator", + "strength_reduce", + "version_check", + "zstd", +] [[package]] -name = "ppv-lite86" -version = "0.2.20" +name = "polars-arrow-format" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +checksum = "19b0ef2474af9396b19025b189d96e992311e6a47f90c53cd998b36c4c64b84c" dependencies = [ - "zerocopy", + "planus", + "serde", ] [[package]] -name = "pretty_assertions" -version = "1.4.0" +name = "polars-compute" +version = "0.41.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +checksum = "9f099516af30ac9ae4b4480f4ad02aa017d624f2f37b7a16ad4e9ba52f7e5269" dependencies = [ - "diff", - "yansi", + "bytemuck", + "either", + "num-traits", + "polars-arrow", + "polars-error", + "polars-utils", + "strength_reduce", + "version_check", +] + +[[package]] +name = "polars-core" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2439484be228b8c302328e2f953e64cfd93930636e5c7ceed90339ece7fef6c" +dependencies = [ + "ahash 0.8.11", + "bitflags 2.6.0", + "bytemuck", + "chrono", + "chrono-tz", + "comfy-table", + "either", + "hashbrown 0.14.5", + "indexmap 2.3.0", + "num-traits", + "once_cell", + "polars-arrow", + "polars-compute", + "polars-error", + "polars-row", + "polars-utils", + "rand", + "rand_distr", + "rayon", + "regex", + "smartstring", + "thiserror", + "version_check", + "xxhash-rust", +] + +[[package]] +name = "polars-error" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c9b06dfbe79cabe50a7f0a90396864b5ee2c0e0f8d6a9353b2343c29c56e937" +dependencies = [ + "polars-arrow-format", + "regex", + "simdutf8", + "thiserror", +] + +[[package]] +name = "polars-expr" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c630385a56a867c410a20f30772d088f90ec3d004864562b84250b35268f97" +dependencies = [ + "ahash 0.8.11", + "bitflags 2.6.0", + "once_cell", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-plan", + "polars-time", + "polars-utils", + "rayon", + "smartstring", +] + +[[package]] +name = "polars-io" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d7363cd14e4696a28b334a56bd11013ff49cc96064818ab3f91a126e453462d" +dependencies = [ + "ahash 0.8.11", + "async-trait", + "atoi_simd", + "bytes", + "chrono", + "fast-float", + "futures", + "home", + "itoa", + "memchr", + 
"memmap2", + "num-traits", + "once_cell", + "percent-encoding", + "polars-arrow", + "polars-core", + "polars-error", + "polars-parquet", + "polars-time", + "polars-utils", + "rayon", + "regex", + "ryu", + "simdutf8", + "smartstring", + "tokio", + "tokio-util", +] + +[[package]] +name = "polars-lazy" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03877e74e42b5340ae52ded705f6d5d14563d90554c9177b01b91ed2412a56ed" +dependencies = [ + "ahash 0.8.11", + "bitflags 2.6.0", + "glob", + "memchr", + "once_cell", + "polars-arrow", + "polars-core", + "polars-expr", + "polars-io", + "polars-mem-engine", + "polars-ops", + "polars-pipe", + "polars-plan", + "polars-time", + "polars-utils", + "rayon", + "smartstring", + "version_check", +] + +[[package]] +name = "polars-mem-engine" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dea9e17771af750c94bf959885e4b3f5b14149576c62ef3ec1c9ef5827b2a30f" +dependencies = [ + "polars-arrow", + "polars-core", + "polars-error", + "polars-expr", + "polars-io", + "polars-ops", + "polars-plan", + "polars-time", + "polars-utils", + "rayon", +] + +[[package]] +name = "polars-ops" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6066552eb577d43b307027fb38096910b643ffb2c89a21628c7e41caf57848d0" +dependencies = [ + "ahash 0.8.11", + "argminmax", + "base64 0.22.1", + "bytemuck", + "chrono", + "chrono-tz", + "either", + "hashbrown 0.14.5", + "hex", + "indexmap 2.3.0", + "memchr", + "num-traits", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-error", + "polars-utils", + "rayon", + "regex", + "smartstring", + "unicode-reverse", + "version_check", +] + +[[package]] +name = "polars-parquet" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b35b2592a2e7ef7ce9942dc2120dc4576142626c0e661668e4c6b805042e461" +dependencies = [ + "ahash 0.8.11", + 
"async-stream", + "base64 0.22.1", + "brotli 5.0.0", + "ethnum", + "flate2", + "futures", + "lz4", + "num-traits", + "parquet-format-safe", + "polars-arrow", + "polars-compute", + "polars-error", + "polars-utils", + "simdutf8", + "snap", + "streaming-decompression", + "zstd", +] + +[[package]] +name = "polars-pipe" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "021bce7768c330687d735340395a77453aa18dd70d57c184cbb302311e87c1b9" +dependencies = [ + "crossbeam-channel", + "crossbeam-queue", + "enum_dispatch", + "hashbrown 0.14.5", + "num-traits", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-expr", + "polars-io", + "polars-ops", + "polars-plan", + "polars-row", + "polars-utils", + "rayon", + "smartstring", + "uuid", + "version_check", +] + +[[package]] +name = "polars-plan" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "220d0d7c02d1c4375802b2813dbedcd1a184df39c43b74689e729ede8d5c2921" +dependencies = [ + "ahash 0.8.11", + "bytemuck", + "chrono-tz", + "either", + "hashbrown 0.14.5", + "once_cell", + "percent-encoding", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-parquet", + "polars-time", + "polars-utils", + "rayon", + "recursive", + "regex", + "smartstring", + "strum_macros", + "version_check", +] + +[[package]] +name = "polars-row" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1d70d87a2882a64a43b431aea1329cb9a2c4100547c95c417cc426bb82408b3" +dependencies = [ + "bytemuck", + "polars-arrow", + "polars-error", + "polars-utils", +] + +[[package]] +name = "polars-sql" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6fc1c9b778862f09f4a347f768dfdd3d0ba9957499d306d83c7103e0fa8dc5b" +dependencies = [ + "hex", + "once_cell", + "polars-arrow", + "polars-core", + "polars-error", + "polars-lazy", + "polars-ops", + 
"polars-plan", + "polars-time", + "rand", + "serde", + "serde_json", + "sqlparser", +] + +[[package]] +name = "polars-time" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "179f98313a15c0bfdbc8cc0f1d3076d08d567485b9952d46439f94fbc3085df5" +dependencies = [ + "atoi", + "bytemuck", + "chrono", + "chrono-tz", + "now", + "once_cell", + "polars-arrow", + "polars-core", + "polars-error", + "polars-ops", + "polars-utils", + "regex", + "smartstring", +] + +[[package]] +name = "polars-utils" +version = "0.41.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53e6dd89fcccb1ec1a62f752c9a9f2d482a85e9255153f46efecc617b4996d50" +dependencies = [ + "ahash 0.8.11", + "bytemuck", + "hashbrown 0.14.5", + "indexmap 2.3.0", + "num-traits", + "once_cell", + "polars-error", + "raw-cpuid", + "rayon", + "smartstring", + "stacker", + "sysinfo", + "version_check", +] + +[[package]] +name = "portable-atomic" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265" + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "pretty_assertions" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +dependencies = [ + "diff", + "yansi", ] [[package]] @@ -3165,7 +3894,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" 
dependencies = [ "proc-macro2", - "syn", + "syn 2.0.72", ] [[package]] @@ -3193,7 +3922,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8021cf59c8ec9c432cfc2526ac6b8aa508ecaf29cd415f271b8406c1b851c3fd" dependencies = [ "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -3210,6 +3939,15 @@ dependencies = [ "thiserror", ] +[[package]] +name = "psm" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5787f7cda34e3033a72192c018bc5883100330f362ef279a8cbccfce8bb4e874" +dependencies = [ + "cc", +] + [[package]] name = "qoi" version = "0.4.1" @@ -3425,6 +4163,26 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.72", +] + [[package]] name = "redox_syscall" version = "0.2.16" @@ -3819,6 +4577,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustversion" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" + [[package]] name = "ryu" version = "1.0.18" @@ -4037,7 +4801,7 @@ checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -4070,7 +4834,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -4121,7 +4885,7 @@ dependencies = [ "darling", "proc-macro2", "quote", 
- "syn", + "syn 2.0.72", ] [[package]] @@ -4231,6 +4995,12 @@ dependencies = [ "quote", ] +[[package]] +name = "simdutf8" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" + [[package]] name = "similar" version = "2.6.0" @@ -4249,6 +5019,12 @@ dependencies = [ "time", ] +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "slab" version = "0.4.9" @@ -4283,6 +5059,17 @@ dependencies = [ "serde", ] +[[package]] +name = "smartstring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" +dependencies = [ + "autocfg", + "static_assertions", + "version_check", +] + [[package]] name = "snafu" version = "0.8.4" @@ -4301,9 +5088,15 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + [[package]] name = "socket2" version = "0.5.7" @@ -4352,6 +5145,15 @@ dependencies = [ "unicode_categories", ] +[[package]] +name = "sqlparser" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "295e9930cd7a97e58ca2a070541a3ca502b17f5d1fa7157376d0fabd85324f25" +dependencies = [ + "log", +] + [[package]] name = "sqlx" version = "0.8.0" @@ -4418,7 +5220,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn", + "syn 2.0.72", ] [[package]] @@ -4441,7 +5243,7 @@ dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn", + "syn 2.0.72", "tempfile", "tokio", "url", @@ -4553,6 +5355,46 @@ dependencies = [ "url", ] 
+[[package]] +name = "stacker" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c886bd4480155fd3ef527d45e9ac8dd7118a898a46530b7b94c3e21866259fce" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "winapi", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "streaming-decompression" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf6cc3b19bfb128a8ad11026086e31d3ce9ad23f8ea37354b31383a187c44cf3" +dependencies = [ + "fallible-streaming-iterator", +] + +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + [[package]] name = "strfmt" version = "0.2.4" @@ -4585,7 +5427,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn", + "syn 2.0.72", ] [[package]] @@ -4596,7 +5438,26 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", +] + +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.72", ] 
[[package]] @@ -4605,6 +5466,17 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.72" @@ -4628,6 +5500,20 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +[[package]] +name = "sysinfo" +version = "0.30.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3" +dependencies = [ + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "windows", +] + [[package]] name = "system-configuration" version = "0.5.1" @@ -4662,6 +5548,12 @@ dependencies = [ "version-compare", ] +[[package]] +name = "target-features" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1bbb9f3c5c463a01705937a24fdabc5047929ac764b2d5b9cf681c1f5041ed5" + [[package]] name = "target-lexicon" version = "0.12.16" @@ -4738,7 +5630,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -4834,7 +5726,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -5016,7 +5908,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -5091,7 +5983,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568" dependencies = [ "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -5154,6 +6046,15 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291" +[[package]] +name = "unicode-reverse" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b6f4888ebc23094adfb574fdca9fdc891826287a6397d2cd28802ffd6f20c76" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "unicode-segmentation" version = "1.11.0" @@ -5311,7 +6212,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.72", "wasm-bindgen-shared", ] @@ -5345,7 +6246,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -5657,6 +6558,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "xxhash-rust" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a5cbf750400958819fb6178eaa83bee5cd9c29a26a40cc241df8c70fdd46984" + [[package]] name = "yansi" version = "0.5.1" @@ -5692,7 +6599,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -5712,7 +6619,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] diff --git a/server/main-api/src/setup/database/alias.rs b/server/main-api/src/setup/database/alias.rs index adb38456a..047c1cf07 100644 --- a/server/main-api/src/setup/database/alias.rs +++ b/server/main-api/src/setup/database/alias.rs @@ -58,22 +58,28 @@ pub async fn download_updates() -> 
Result, crate::BoxedError> for index in 0..id_col.len() { let id = id_col.get(index).unwrap(); let r#type = type_col.get(index).unwrap(); - let visible_id = visible_id_col.get(index).unwrap(); + let visible_id = visible_id_col.get(index); + let visible_id = match visible_id { + Some(v) => v.to_string(), + None => id.to_string(), + }; aliase.push(Alias { alias: id.to_string(), key: id.to_string(), r#type: r#type.to_string(), - visible_id: visible_id.to_string(), + visible_id: visible_id.clone(), }); aliase.push(Alias { - alias: visible_id.to_string(), + alias: visible_id.clone(), key: id.to_string(), r#type: r#type.to_string(), - visible_id: visible_id.to_string(), + visible_id: visible_id.clone(), }); } let df_expanded = df.explode(["aliases"])?; + let mask = df_expanded.column("aliases")?.is_not_null(); + let df_expanded = df_expanded.filter(&mask)?; let id_col = df_expanded.column("id")?.str()?; let type_col = df_expanded.column("type")?.str()?; let visible_id_col = df_expanded.column("visible_id")?.str()?; @@ -82,12 +88,16 @@ pub async fn download_updates() -> Result, crate::BoxedError> let alias = aliases_col.get(index).unwrap(); let id = id_col.get(index).unwrap(); let r#type = type_col.get(index).unwrap(); - let visible_id = visible_id_col.get(index).unwrap(); + let visible_id = visible_id_col.get(index); + let visible_id = match visible_id { + Some(v) => v.to_string(), + None => id.to_string(), + }; aliase.push(Alias { alias: alias.to_string(), key: id.to_string(), r#type: r#type.to_string(), - visible_id: visible_id.to_string(), + visible_id, }); } Ok(LimitedVec(aliase)) diff --git a/server/main-api/src/setup/database/data.rs b/server/main-api/src/setup/database/data.rs index efc9754db..789922091 100644 --- a/server/main-api/src/setup/database/data.rs +++ b/server/main-api/src/setup/database/data.rs @@ -150,7 +150,7 @@ pub(super) async fn load_all_to_db( Ok(()) } #[tracing::instrument] -pub async fn download_status() -> Result<(LimitedVec,LimitedVec), 
crate::BoxedError> { +pub async fn download_status() -> Result<(LimitedVec, LimitedVec), crate::BoxedError> { let cdn_url = std::env::var("CDN_URL").unwrap_or_else(|_| "https://nav.tum.de/cdn".to_string()); let body = reqwest::get(format!("{cdn_url}/status_data.parquet")) .await? @@ -161,7 +161,10 @@ pub async fn download_status() -> Result<(LimitedVec,LimitedVec), c file.write_all(&body)?; let df = ParquetReader::new(&mut file).finish().unwrap(); let id_col = Vec::from(df.column("id")?.str()?); - let id_col=id_col.into_iter().filter_map(|s| s.map(String::from)).collect(); + let id_col = id_col + .into_iter() + .filter_map(|s| s.map(String::from)) + .collect(); let hash_col = Vec::from(df.column("hash")?.i64()?); let hash_col = hash_col.into_iter().flatten().collect(); Ok((LimitedVec(id_col), LimitedVec(hash_col))) diff --git a/server/main-api/src/setup/database/mod.rs b/server/main-api/src/setup/database/mod.rs index 9f18d4439..aa763b4e1 100644 --- a/server/main-api/src/setup/database/mod.rs +++ b/server/main-api/src/setup/database/mod.rs @@ -14,7 +14,7 @@ pub async fn setup(pool: &sqlx::PgPool) -> Result<(), crate::BoxedError> { } #[tracing::instrument(skip(pool))] pub async fn load_data(pool: &sqlx::PgPool) -> Result<(), crate::BoxedError> { - let (new_keys,new_hashes) = data::download_status().await?; + let (new_keys, new_hashes) = data::download_status().await?; { let _ = info_span!("deleting old data").enter(); let mut tx = pool.begin().await?; From e10b7443e3a176b5c20b012474136066d18a0370 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Sun, 4 Aug 2024 03:00:34 +0200 Subject: [PATCH 5/5] updated the docs --- README.md | 15 ++++------ data/README.md | 75 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 56 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index b5cffd4ed..812f98d33 100644 --- a/README.md +++ b/README.md @@ -64,12 +64,7 @@ cd Navigatum ### Data Processing In case you do not want to work on the data 
processing, you can instead -download the latest compiled files: - -```bash -wget -P data/output https://nav.tum.de/cdn/api_data.json -wget -P data/output https://nav.tum.de/cdn/search_data.json -``` +download the latest compiled files via running the server. Else you can follow the steps in the [data documentation](data/README.md). @@ -84,9 +79,11 @@ docker compose -f docker-compose.local.yml up --build ``` > [!NOTE] -> While most of the setup is simple, we need to download data (only Oberbayern is needed) for the initial setup. This takes 1-2 minutes. -> Please first bring up a [postgis](https://postgis.net/) instance (for example via `docker compose -f docker-compose.local.yml up --build`) and then run: -> +> While most of the setup is simple, we need to download data (only Oberbayern is needed) for the initial setup. This +> takes 1-2 minutes. +> Please first bring up a [postgis](https://postgis.net/) instance (for example +> via `docker compose -f docker-compose.local.yml up --build`) and then run: +> > ```bash > wget -O data.pbf https://download.geofabrik.de/europe/germany/bayern/oberbayern-latest.osm.pbf > docker run -it -v $(pwd):/data -e PGPASSWORD=CHANGE_ME --network="host" iboates/osm2pgsql:latest osm2pgsql --create --slim --database postgres --user postgres --host 127.0.0.1 --port 5432 /data/data.pbf --hstore --hstore-add-index --hstore-column raw diff --git a/data/README.md b/data/README.md index 939099caa..b57606d4e 100644 --- a/data/README.md +++ b/data/README.md @@ -9,16 +9,21 @@ This folder contains: The code to retrieve external data, as well as externally retrieved data is located under `external`. > [!WARNING] -> A lot of this code is more a work in progress than finished. Especially features such as POIs, custom maps or other data types such as events are drafted but not yet fully implemented. +> A lot of this code is more a work-in-progress than finished. 
+> Especially features such as POIs, custom maps or other data types such as events are drafted but not yet fully implemented. > -> New external data might break the scripts from time to time, as either rooms or buildings are removed, the external data has errors or we make assumptions here that turn out to be wrong. +> New external data might break the scripts from time to time, +> - as either rooms or buildings are removed, +> - the external data has errors, +> - or we make assumptions here that turn out to be wrong. ## Getting started ### Prerequisites -For getting started, there are some system dependencys which you will need. -Please follow the [system dependencys docs](/resources/documentation/Dependencys.md) before trying to run this part of our project. +For getting started, there are some system dependencies which you will need. +Please follow the [system dependencies docs](/resources/documentation/Dependencys.md) before trying to run this part of +our project. ### Dependencies @@ -63,7 +68,8 @@ python3 tumonline.py python3 compile.py ``` -The exported datasets will be stored in `output/` as JSON files. +The exported datasets will be stored in `output/` +as [JSON](https://www.json.org/json-de.html)/[Parquet](https://wikipedia.org/wiki/Apache_Parquet) files. ### Directory structure @@ -92,18 +98,33 @@ data ```json { - "entry-id": { - "id": "entry-id", - "type": "room", - ... data as specified in `data-format.yaml` - }, - ... all other entries in the same form + "entry-id": { + "id": "entry-id", + "type": "room", + ... + data + as + specified + in + ` + data-format.yaml + ` + }, + ... + all + other + entries + in + the + same + form } ``` ## Compilation process -The data compilation is made of indiviual processing steps, where each step adds new or modifies the current data. The basic structure of the data however stays the same from the beginning on and is specified in `data-format_*.yaml`. 
+The data compilation is made of individual processing steps, where each step adds new or modifies the current data. The +basic structure of the data however stays the same from the beginning on and is specified in `data-format_*.yaml`. - **Step 00**: The first step reads the base root node, areas, buildings etc. from the `sources/00_areatree` file and creates an object collection (python dictionary) @@ -111,18 +132,18 @@ The data compilation is made of indiviual processing steps, where each step adds - **Steps 01-29**: Within these steps, new rooms or POIs might be added, however no new areas or buildings, since all areas and buildings have to be defined in the _areatree_. After them, no new entries are being added to the data. - - **Steps 0x**: Supplement the base data with extended custom data. - - **Steps 1x**: Import rooms and building information from external sources - - **Steps 2x**: Import POIs + - **Steps 0x**: Supplement the base data with extended custom data. + - **Steps 1x**: Import rooms and building information from external sources + - **Steps 2x**: Import POIs - **Steps 30-89**: Later steps are intended to augment the entries with even more information and to ensure a consistent format. After them, no new (external or custom) information should be added to the data. - - **Steps 3x**: Make data more coherent & structural stuff - - **Steps 4x**: Coordinates and maps - - **Steps 5x**: Add images - - **Steps 6x**: - - - **Steps 7x**: - - - **Steps 8x**: Generate properties and sections (such as overview sections) + - **Steps 3x**: Make data more coherent & structural stuff + - **Steps 4x**: Coordinates and maps + - **Steps 5x**: Add images + - **Steps 6x**: - + - **Steps 7x**: - + - **Steps 8x**: Generate properties and sections (such as overview sections) - **Steps 90-99**: Process and export for search. - **Step 100**: Export final data (for use in the API). Some temporary data fields might be removed at this point. 
@@ -136,12 +157,16 @@ Details about the formatting are given at the head of the file. ## License -The source data (i.e. all files located in `sources/` that are not images) is made available under the Open Database License: . -Any rights in individual contents of the database are licensed under the Database Contents License: . +The source data (i.e. all files located in `sources/` that are not images) is made available under the Open Database +License: . +Any rights in individual contents of the database are licensed under the Database Contents +License: . > [!WARNING] -> The images in `sources/img/` are subject to their own licensing terms, which are stated in the file `sources/img/img-sources.yaml`. -> The compiled database may contain contents from external sources (i.e. all files in `external/`) that do have different license terms. +> The images in `sources/img/` are subject to their own licensing terms, which are stated in the +> file `sources/img/img-sources.yaml`. +> The compiled database may contain contents from external sources (i.e. all files in `external/`) that do have +> different license terms. ---