Skip to content

Commit

Permalink
Dev (#6)
Browse files Browse the repository at this point in the history
* Add primary key to releases

* Add cli bool flag to write sql indexes

* Remove filler id for labels and artists

* Add masters file parser

* Add ci/cd test for masters file parser

* Add info about indexes

* Formatting and refactoring
  • Loading branch information
DylanBartels authored Mar 22, 2022
1 parent fafdf0a commit b7d9527
Show file tree
Hide file tree
Showing 17 changed files with 867 additions and 145 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ jobs:
run: cargo run --bin discogs-load discogs-load/test_data/releases.xml.gz

- name: Run labels
run: cargo run --bin discogs-load discogs-load/test_data/releases.xml.gz
run: cargo run --bin discogs-load discogs-load/test_data/labels.xml.gz

- name: Run artists
run: cargo run --bin discogs-load discogs-load/test_data/releases.xml.gz
run: cargo run --bin discogs-load discogs-load/test_data/artists.xml.gz

- name: Run masters
run: cargo run --bin discogs-load discogs-load/test_data/masters.xml.gz
24 changes: 15 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ USAGE:
discogs-load [OPTIONS] [FILE(S)]...

FLAGS:
-h, --help Prints help information
-V, --version Prints version information
--create-indexes Creates indexes
-h, --help Prints help information
-V, --version Prints version information

OPTIONS:
--batch-size <batch-size> Number of rows per insert [default: 10000]
Expand All @@ -47,13 +48,23 @@ ARGS:

## Usage

Download the releases data dump [here](http://www.discogs.com/data/), and run the binary with the path to the gz compressed file as only argument. For the example below we'll use a dockerized postgres instance.
Download the releases data dump [here](http://www.discogs.com/data/), and run the binary with the path to the gz compressed file(s) as the only argument. For the example below we'll use a dockerized postgres instance.

```
docker-compose up -d postgres
./discogs-load-aarch64-apple-darwin discogs_20211201_releases.xml.gz discogs_20220201_labels.xml.gz
```

It is afterwards possible to run the initialization of the project-defined indexes.

```
./discogs-load-aarch64-apple-darwin --create-indexes
```

## Datamodel

![Datamodel](imgs/datamodel.png)

## Tests

If you don't want to run the huge releases file, it is possible to run a smaller example file like so:
Expand All @@ -69,9 +80,4 @@ And do a small manual test:
docker exec -it discogs-load-postgres-1 /bin/bash
psql -U dev discogs
select * from release;
```

## Contributing/Remaining todo

- Create a parser for the masters dataset
- Create a proper relational database schema
```
10 changes: 5 additions & 5 deletions discogs-load/src/artist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use crate::parser::Parser;

#[derive(Clone, Debug)]
pub struct Artist {
pub id: i32,
pub name: String,
pub real_name: String,
pub profile: String,
Expand All @@ -21,6 +22,7 @@ pub struct Artist {
impl SqlSerialization for Artist {
fn to_sql(&self) -> Vec<&'_ (dyn ToSql + Sync)> {
let row: Vec<&'_ (dyn ToSql + Sync)> = vec![
&self.id,
&self.name,
&self.real_name,
&self.profile,
Expand All @@ -37,6 +39,7 @@ impl SqlSerialization for Artist {
impl Artist {
pub fn new() -> Self {
Artist {
id: 0,
name: String::new(),
real_name: String::new(),
profile: String::new(),
Expand Down Expand Up @@ -70,7 +73,6 @@ pub struct ArtistsParser<'a> {
state: ParserState,
artists: HashMap<i32, Artist>,
current_artist: Artist,
current_id: i32,
pb: ProgressBar,
db_opts: &'a DbOpt,
}
Expand All @@ -81,7 +83,6 @@ impl<'a> ArtistsParser<'a> {
state: ParserState::Artist,
artists: HashMap::new(),
current_artist: Artist::new(),
current_id: 0,
pb: ProgressBar::new(7993954),
db_opts,
}
Expand All @@ -94,7 +95,6 @@ impl<'a> Parser<'a> for ArtistsParser<'a> {
state: ParserState::Artist,
artists: HashMap::new(),
current_artist: Artist::new(),
current_id: 0,
pb: ProgressBar::new(7993954),
db_opts,
}
Expand Down Expand Up @@ -126,7 +126,7 @@ impl<'a> Parser<'a> for ArtistsParser<'a> {

Event::End(e) if e.local_name() == b"artist" => {
self.artists
.entry(self.current_id)
.entry(self.current_artist.id)
.or_insert(self.current_artist.clone());
if self.artists.len() >= self.db_opts.batch_size {
// use drain? https://doc.rust-lang.org/std/collections/struct.HashMap.html#examples-13
Expand All @@ -149,7 +149,7 @@ impl<'a> Parser<'a> for ArtistsParser<'a> {

ParserState::Id => match ev {
Event::Text(e) => {
self.current_id = str::parse(str::from_utf8(&e.unescaped()?)?)?;
self.current_artist.id = str::parse(str::from_utf8(&e.unescaped()?)?)?;
ParserState::Id
}

Expand Down
95 changes: 72 additions & 23 deletions discogs-load/src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,14 @@ use structopt::StructOpt;

use crate::artist::Artist;
use crate::label::Label;
use crate::master::{Master, MasterArtist};
use crate::release::{Release, ReleaseLabel, ReleaseVideo};

#[derive(Debug, Clone, StructOpt)]
pub struct DbOpt {
/// Creates indexes
#[structopt(long = "create-indexes")]
pub create_indexes: bool,
/// Number of rows per insert
#[structopt(long = "batch-size", default_value = "10000")]
pub batch_size: usize,
Expand All @@ -34,17 +38,19 @@ pub trait SqlSerialization {

/// Initialize schema and close connection.
pub fn init(db_opts: &DbOpt, schema_path: &str) -> Result<()> {
info!("Creating the tables.");
let db = Db::connect(db_opts);
Db::create_schema(&mut db?, schema_path)?;
Db::execute_file(&mut db?, schema_path)?;
Ok(())
}

// /// Initialize indexes and close connection.
// pub fn indexes(opts: &DbOpt) -> Result<()> {
// let db = Db::connect(opts);
// Db::create_indexes(&mut db?)?;
// Ok(())
// }
/// Initialize indexes and close connection.
pub fn indexes(opts: &DbOpt, file_path: &str) -> Result<()> {
info!("Creating the indexes.");
let db = Db::connect(opts);
Db::execute_file(&mut db?, file_path)?;
Ok(())
}

pub fn write_releases(
db_opts: &DbOpt,
Expand All @@ -71,6 +77,17 @@ pub fn write_artists(db_opts: &DbOpt, artists: &HashMap<i32, Artist>) -> Result<
Ok(())
}

pub fn write_masters(
db_opts: &DbOpt,
masters: &HashMap<i32, Master>,
masters_artists: &HashMap<i32, MasterArtist>,
) -> Result<()> {
let mut db = Db::connect(db_opts)?;
Db::write_master_rows(&mut db, masters)?;
Db::write_master_artists_rows(&mut db, masters_artists)?;
Ok(())
}

struct Db {
db_client: Client,
}
Expand All @@ -89,12 +106,13 @@ impl Db {
fn write_release_rows(&mut self, data: &HashMap<i32, Release>) -> Result<()> {
let insert = InsertCommand::new(
"release",
"(status, title, country, released, notes, genres, styles, master_id, data_quality)",
"(id, status, title, country, released, notes, genres, styles, master_id, data_quality)",
)?;
insert.execute(
&mut self.db_client,
data,
&[
Type::INT4,
Type::TEXT,
Type::TEXT,
Type::TEXT,
Expand All @@ -110,30 +128,35 @@ impl Db {
}

fn write_release_labels_rows(&mut self, data: &HashMap<i32, ReleaseLabel>) -> Result<()> {
let insert = InsertCommand::new("release_label", "(label, catno)")?;
insert.execute(&mut self.db_client, data, &[Type::TEXT, Type::TEXT])?;
let insert = InsertCommand::new("release_label", "(release_id, label, catno, label_id)")?;
insert.execute(
&mut self.db_client,
data,
&[Type::INT4, Type::TEXT, Type::TEXT, Type::INT4],
)?;
Ok(())
}

fn write_release_videos_rows(&mut self, data: &HashMap<i32, ReleaseVideo>) -> Result<()> {
let insert = InsertCommand::new("release_video", "(duration, src, title)")?;
let insert = InsertCommand::new("release_video", "(release_id, duration, src, title)")?;
insert.execute(
&mut self.db_client,
data,
&[Type::INT4, Type::TEXT, Type::TEXT],
&[Type::INT4, Type::INT4, Type::TEXT, Type::TEXT],
)?;
Ok(())
}

fn write_label_rows(&mut self, data: &HashMap<i32, Label>) -> Result<()> {
let insert = InsertCommand::new(
"label",
"(name, contactinfo, profile, parent_label, sublabels, urls, data_quality)",
"(id, name, contactinfo, profile, parent_label, sublabels, urls, data_quality)",
)?;
insert.execute(
&mut self.db_client,
data,
&[
Type::INT4,
Type::TEXT,
Type::TEXT,
Type::TEXT,
Expand All @@ -149,12 +172,13 @@ impl Db {
fn write_artist_rows(&mut self, data: &HashMap<i32, Artist>) -> Result<()> {
let insert = InsertCommand::new(
"artist",
"(name, real_name, profile, data_quality, name_variations, urls, aliases, members)",
"(id, name, real_name, profile, data_quality, name_variations, urls, aliases, members)",
)?;
insert.execute(
&mut self.db_client,
data,
&[
Type::INT4,
Type::TEXT,
Type::TEXT,
Type::TEXT,
Expand All @@ -168,19 +192,44 @@ impl Db {
Ok(())
}

fn create_schema(&mut self, schema_path: &str) -> Result<()> {
info!("Creating the tables.");
fn write_master_rows(&mut self, data: &HashMap<i32, Master>) -> Result<()> {
let insert = InsertCommand::new(
"master",
"(id, title, release_id, year, notes, genres, styles, data_quality)",
)?;
insert.execute(
&mut self.db_client,
data,
&[
Type::INT4,
Type::TEXT,
Type::INT4,
Type::INT4,
Type::TEXT,
Type::TEXT_ARRAY,
Type::TEXT_ARRAY,
Type::TEXT,
],
)?;
Ok(())
}

fn write_master_artists_rows(&mut self, data: &HashMap<i32, MasterArtist>) -> Result<()> {
let insert =
InsertCommand::new("master_artist", "(artist_id, master_id, name, anv, role)")?;
insert.execute(
&mut self.db_client,
data,
&[Type::INT4, Type::INT4, Type::TEXT, Type::TEXT, Type::TEXT],
)?;
Ok(())
}

fn execute_file(&mut self, schema_path: &str) -> Result<()> {
let tables_structure = fs::read_to_string(schema_path).unwrap();
self.db_client.batch_execute(&tables_structure).unwrap();
Ok(())
}

// fn create_indexes(&mut self) -> Result<()> {
// info!("Creating the indexes.");
// let tables_structure = fs::read_to_string("sql/indexes/release.sql").unwrap();
// self.db_client.batch_execute(&tables_structure).unwrap();
// Ok(())
// }
}

struct InsertCommand {
Expand Down
Loading

0 comments on commit b7d9527

Please sign in to comment.