document main viz 3 datasets #34
robindemourat committed Oct 11, 2021
1 parent 841d13d commit ef126db
Showing 18 changed files with 513 additions and 107 deletions.
12 changes: 6 additions & 6 deletions datascripts/part_1.py
@@ -78,9 +78,9 @@ def output_row(region, year, region_trade, region_products, total_trade):
`decline_longitudinal_data.csv` documentation
===
-# What is the data ?
+# What is the original data ?
-toflit18 flows
+toflit18 flows from "bdd_base_courante" file
# What does a line correspond to ?
@@ -131,9 +131,9 @@ def output_row(region, year, region_trade, region_products, total_trade):
`decline_LR_products.csv` documentation
===
-# What is the data ?
+# What is the original data ?
-toflit18 flows
+toflit18 flows from "bdd_base_courante" file
# What does a line correspond to ?
@@ -169,9 +169,9 @@ def output_row(region, year, region_trade, region_products, total_trade):
`decline_LR_partners.csv` documentation
===
-# What is the data ?
+# What is the original data ?
-toflit18 flows
+toflit18 flows from "bdd_base_courante" file
# What does a line correspond to ?
4 changes: 2 additions & 2 deletions datascripts/part_2_navigo.py
@@ -125,7 +125,7 @@ def format_for_viz(f):
`part_2_navigo_viz_data.csv` documentation
===
-# What is the data ?
+# What is the original data ?
Navigo flows for 1789
@@ -148,7 +148,7 @@ def format_for_viz(f):
- the "bureau des fermes" associated to the travel departure is modified/cleaned on the go in the datascript (see `datascripts/part_2_navigo.py`), this could be resolved upstream at some point.
"""
ensure_dir("../public/data/part_2_navigo_viz_data/")
-write_readme('part2_navigo_viz_data/README.md', info)
+write_readme('part_2_navigo_viz_data/README.md', info)
destination_filepath = "../public/data/part_2_navigo_viz_data/part_2_navigo_viz_data.csv"
with open(destination_filepath, "w", newline='') as csvfile:
logger.info('start | part 2 main viz navigo data')
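Several of the touched scripts pair `ensure_dir` with the newly imported `write_readme` helper from `lib`. The helpers' bodies are not part of this diff; below is a minimal sketch of what they might look like, assuming (as the calls above suggest) that README paths are resolved relative to `../public/data/`.

```python
import os

DATA_ROOT = "../public/data/"  # assumption inferred from the dataset paths above

def ensure_dir(path):
    # Create the output directory if it does not already exist.
    os.makedirs(path, exist_ok=True)

def write_readme(relative_path, info):
    # Hypothetical sketch: write the documentation string next to its dataset,
    # e.g. write_readme('part_2_navigo_viz_data/README.md', info).
    with open(os.path.join(DATA_ROOT, relative_path), "w", encoding="utf-8") as f:
        f.write(info)
```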
41 changes: 36 additions & 5 deletions datascripts/part_2_toflit18.py
@@ -1,7 +1,6 @@
import csv
from collections import defaultdict
import os
-from lib import ensure_dir, logger
+from lib import ensure_dir, logger, write_readme


logger.info('start | part 2 main viz toflit18 data')
@@ -25,7 +24,7 @@
}
for f in relevant_flows :
product_weight_kg = 0
-# @todo a lot of products are flushed out when doing thing
+# @todo a lot of products are flushed out when doing the following
if f['quantity_unit_metric'] and f['quantity_unit_metric'] == 'kg':
product_weight_kg = float(f['quantities_metric'] if f['quantities_metric'] else 0)
f['product_weight_kg'] = product_weight_kg
@@ -40,7 +39,6 @@
colonies_products[flow_type][product] = int(colonies_products[flow_type][product]) + int(value) if product in colonies_products[flow_type] else int(value)

if product in ['Café', 'Sucre', 'Indigo', 'Coton non transformé']:
-# product_viz = "produit colonial ('Café', 'Sucre', 'Indigo', 'Coton non transformé')"
product_viz = "produits coloniaux"
elif (product == 'Sel'):
product_viz = 'sel'
@@ -102,9 +100,42 @@ def format_for_viz(f):
"""
flows_viz = list(uniques.values())

-# write dataset
+# write and document dataset
info = """
`part_2_toflit_viz_data.csv` documentation
===
# What is the original data ?
toflit18 flows from "bdd_base_courante" file
# What does a line correspond to ?
An aggregation of toflit18 flows for 1789, corresponding to :
- 1 bureau des fermes in particular
- 1 class of partner in particular
- 1 type of product in particular
# Filters
- source "Best Guess customs region prod x partner" (best_guess_region_prodxpart == 1)
- year : 1789
- customs_region : La Rochelle
# Aggregation/computation info
- values aggregated by cumulated value in livre tournois
- the partner column is built from a custom classification, to be seen directly in the datascript `datascripts/part_2_toflit18.py`
- the product column is built from a custom classification, to be seen directly in the datascript `datascripts/part_2_toflit18.py`
# Notes/warning
- Product weights are only rarely specified in flows
"""
dataset_filepath = "../public/data/part_2_toflit_viz_data/part_2_toflit_viz_data.csv"
ensure_dir("../public/data/part_2_toflit_viz_data/")
write_readme("part_2_toflit_viz_data/README.md", info)
with open(dataset_filepath, "w") as csvfile:
fieldnames = flows_viz[0].keys()
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
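The hunk is cut off right after the `DictWriter` is created. A typical completion of this write-and-document pattern might flush the aggregated rows as follows; this is a sketch, not the committed code.

```python
import csv

# Hypothetical continuation of the pattern shown above; `flows_viz` and
# `dataset_filepath` are defined in the script excerpt just before this point.
with open(dataset_filepath, "w", newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=flows_viz[0].keys())
    writer.writeheader()
    writer.writerows(flows_viz)
```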
202 changes: 190 additions & 12 deletions datascripts/part_3.py
@@ -1,7 +1,8 @@
"""
@author : Cécile Asselin
DOCUMENTATION OF THIS SCRIPT
-What does it do? ==> Writes CSV data files to feed the visualizations
+What does it do? ==> Writes CSV data files to feed the visualizations of part 3 of the site
(csv 0 locating all DFLR ports within their bureaux_map, provinces, admiralties, directions ... to feed the intro maps)
for each DFLR port in 1789 :
@@ -54,10 +55,8 @@

import networkx as nx
import csv
-from operator import itemgetter
-from random import random
import os
-from lib import ensure_dir, logger
+from lib import ensure_dir, logger, write_readme

logger.info('start | part 3 main viz datasets')

@@ -101,7 +100,7 @@ def correct_localities_alignment(port):
# ports_location_data_filepath : aims to feed intro maps (ports located in their bureaux, and provinces, and admiralties)
ports_location_data_filepath = "../public/data/ports_locations_data/ports_locations_data.csv"
# ports_tonnages_part3_data_filepath : aims to feed step 1 of the main viz of part3
-ports_tonnages_part3_data_filepath = "../public/data/part_3_step1_viz_data/part_3_step1_viz_data.csv"
+part_3_step1_viz_data_filepath = "../public/data/part_3_step1_viz_data/part_3_step1_viz_data.csv"
ensure_dir("../public/data/ports_locations_data/")
ensure_dir("../public/data/part_3_step1_viz_data/")

@@ -153,7 +152,35 @@ def correct_localities_alignment(port):
for port, values in ports.items():
values['mean_tonnage'] = values['cumulated_tonnage'] / values['nb_pointcalls_out'] if values['nb_pointcalls_out'] != 0 else 0

-# write datasets
+# write and document datasets
info = """
`ports_location_data.csv` documentation
===
# What is the original data ?
Navigo pointcalls from pointcalls API endpoint
# What does a line correspond to ?
A specific port from PASA and its related general information.
# Filters
- year : 1789
- pointcall_function : 'O'
- ferme_direction : 'La Rochelle'
- pointcall_action : 'Out'
# Aggregation/computation info
/
# Notes/warning
/
"""
write_readme("ports_location_data/README.md", info)
with open(ports_location_data_filepath, "w", newline='') as csvfile:
fieldnames = ['port', 'latitude', 'longitude',
'customs_office', 'province', 'admiralty', 'customs_region']
@@ -172,7 +199,35 @@ def correct_localities_alignment(port):
})

# ports_tonnages_part3_data_filepath : aims to feed step 1 of the main viz of part3
-with open(ports_tonnages_part3_data_filepath, "w", newline='') as csvfile:
info = """
`part_3_step1_viz_data.csv` documentation
===
# What is the original data ?
Navigo pointcalls from pointcalls API endpoint
# What does a line correspond to ?
A specific port from PASA and its related tonnage and travel data for 1789.
# Filters
- year : 1789
- pointcall_function : 'O'
- ferme_direction : 'La Rochelle'
- pointcall_action : 'Out'
# Aggregation/computation info
/
# Notes/warning
/
"""
write_readme("part_3_step1_viz_data/README.md", info)
+with open(part_3_step1_viz_data_filepath, "w", newline='') as csvfile:
fieldnames = ['port', 'nb_pointcalls_out', 'mean_tonnage',
'cumulated_tonnage', 'latitude', 'longitude', 'customs_office']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
@@ -327,7 +382,37 @@ def build_relevant_toflit_flows():
bureaux_map[bureau]['cumulated_exports_value_from_ext'] += value
bureaux_map[bureau]['nb_toflit_flows_taken_into_account'] += 1

-# write dataset
+# write and document dataset
info = """
`part_3_step3_viz_ports_data.csv` documentation
===
# What is the original data ?
Navigo flows from the raw flows API endpoint
# What does a line correspond to ?
A specific port and its related data for 1787 or 1789, depending on the port.
# Filters
- for all flows : only flows coming out of the studied port (flows entering the port are not taken into account)
- for La Rochelle Data :
- year : 1789
- pointcall_function : 'O'
- for Bordeaux, Nantes and Le Havre data :
- year : 1787
# Aggregation/computation info
- we distinguish tonnages for travels inside PASA from travels outside PASA
# Notes/warning
- we use different years depending on data availability for the PASA region and the compared ports
"""
write_readme("part_3_step3_viz_ports_data/README.md", info)
with open(part_3_step3_viz_ports_data_filepath, 'w', newline='') as csvfile1:
fieldnames1 = ['type_of_object', 'name', 'cumulated_tonnage_in_region', 'cumulated_tonnage_out_region', 'nb_navigo_flows_taken_into_account', 'customs_office', 'customs_region', 'latitude', 'longitude']
writer1 = csv.DictWriter(csvfile1, fieldnames=fieldnames1)
@@ -348,9 +433,53 @@ def build_relevant_toflit_flows():
direction = 'Bordeaux'
else:
direction = 'La Rochelle'
-writer1.writerow({'type_of_object': 'port', 'name': values['port'], 'cumulated_tonnage_in_region': values['cumulated_tonnage_in_region'], 'cumulated_tonnage_out_region': values['cumulated_tonnage_out_region'], 'nb_navigo_flows_taken_into_account': values['nb_navigo_flows_taken_into_account'], 'customs_office': bureau, 'customs_region': direction, 'latitude': values['latitude'] if 'latitude' in values.keys() else 'ERROR', 'longitude': values['longitude'] if 'longitude' in values.keys() else 'ERROR'})
writer1.writerow({
'type_of_object': 'port',
'name': values['port'],
'cumulated_tonnage_in_region': values['cumulated_tonnage_in_region'],
'cumulated_tonnage_out_region': values['cumulated_tonnage_out_region'],
'nb_navigo_flows_taken_into_account': values['nb_navigo_flows_taken_into_account'],
'customs_office': bureau,
'customs_region': direction,
'latitude': values['latitude'] if 'latitude' in values.keys() else 'ERROR',
'longitude': values['longitude'] if 'longitude' in values.keys() else 'ERROR'
})
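The "Aggregation/computation info" above states that tonnages are split between travels inside and outside PASA, based on the flows' destination. A hedged sketch of that accumulation step, where the flow field names and the admiralty set are assumptions while the dataset columns come from `fieldnames1` above:

```python
# Hypothetical sketch of the in-region / out-region tonnage split described above.
PASA_ADMIRALTIES = {"La Rochelle", "Marennes", "Sables d'Olonne"}  # assumption

def accumulate_tonnage(port_entry, flow):
    tonnage = float(flow.get("tonnage") or 0)
    if flow.get("destination_admiralty") in PASA_ADMIRALTIES:  # assumed field name
        port_entry["cumulated_tonnage_in_region"] += tonnage
    else:
        port_entry["cumulated_tonnage_out_region"] += tonnage
    port_entry["nb_navigo_flows_taken_into_account"] += 1
```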

info = """
`part_3_step3_viz_customs_offices_data.csv` documentation
===
# What is the original data ?
- Toflit18 flows taken from the `bdd_base_courante.csv` dataset.
- Navigation data from the Navigo `raw_flows` endpoint
# What does a line correspond to ?
Navigation data for 1789 related to a specific *bureau des fermes* (thanks to bureaux-ports alignments)
# Filters
- for Toflit18 :
- year : 1789
- type of flow : exports
- targeted customs offices : ['La Rochelle', 'Marennes', 'Rochefort', 'Saint-Martin-de-Ré', "Les Sables d'Olonne", "Tonnay-Charente", 'Aligre', 'Charente', 'Le Havre', 'Bordeaux', 'Nantes']
- for navigo data :
- year : 1789
- pointcall_function : 'O'
- only data from ports attached to PASA's customs offices/bureaux des fermes
# Aggregation/computation info
- we distinguish navigo tonnages for travels inside PASA from travels outside PASA, based on the flows' destination
- we distinguish toflit18 exports of PASA products from exports of non-PASA products, based on the flows' "origin" field
# Notes/warning
/
"""
write_readme("part_3_step3_viz_customs_offices_data/README.md", info)
with open(part_3_step3_viz_customs_offices_data_filepath, 'w', newline='') as csvfile2:
fieldnames2 = ['type_of_object', 'name', 'cumulated_tonnage_in_region', 'cumulated_tonnage_out_region', 'nb_navigo_flows_taken_into_account', 'cumulated_exports_value_from_region', 'cumulated_exports_value_from_ext', 'nb_toflit_flows_taken_into_account','customs_office', 'customs_region', 'latitude', 'longitude']
writer2 = csv.DictWriter(csvfile2, fieldnames=fieldnames2)
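Under the same caveats, the toflit18 side of the aggregation, which splits export values by the flows' "origin" field as documented above, might look like this sketch (field names taken from `fieldnames2` above, helper and parameter names hypothetical):

```python
# Hypothetical sketch of the exports split by "origin" described above.
def accumulate_exports(bureau_entry, flow, pasa_origins):
    # pasa_origins: assumed set of origin values considered part of PASA.
    value = float(flow.get("value") or 0)
    if flow.get("origin") in pasa_origins:
        bureau_entry["cumulated_exports_value_from_region"] += value
    else:
        bureau_entry["cumulated_exports_value_from_ext"] += value
    bureau_entry["nb_toflit_flows_taken_into_account"] += 1
```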
@@ -464,8 +593,31 @@ def add_edge(g, source, target, tonnage):
else:
for id in graph.nodes():
graph.nodes[id]["degree"] = graph.degree(id)
info = """
`{}.gexf` documentation
===
# What is the original data ?
Navigo flows from the API "raw flows" endpoint
# Filters
- year : 1787
- we keep only flows that contain a departure OR destination in the following admiralties : {}
# Aggregation/computation info
- node `degree` is computed for each node
- node `weight` represents the number of flows
# Notes/warning
/
""".format(name, ', '.join(list(admiralties)))
ensure_dir('../public/data/' + name + '/')
nx.write_gexf(graph, '../public/data/' + name + '/' + name + '.gexf')
write_readme(name + "/" + "README.md", info)
return graph
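`add_edge` itself is defined earlier in `part_3.py` and is not shown in this diff. Given that the README above states that node `weight` is the number of flows, a plausible sketch of such a helper, under those assumptions:

```python
import networkx as nx

def add_edge(g, source, target, tonnage):
    # Hypothetical sketch: count flows on the nodes (node `weight`, per the
    # README above) and accumulate tonnage on the edge between the two ports.
    for node in (source, target):
        if not g.has_node(node):
            g.add_node(node, weight=0)
        g.nodes[node]["weight"] += 1
    if g.has_edge(source, target):
        g[source][target]["tonnage"] += tonnage
    else:
        g.add_edge(source, target, tonnage=tonnage)

# Usage example with made-up values:
graph = nx.Graph()
add_edge(graph, "La Rochelle", "Nantes", 120.0)
```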

def build_centrality_metrics(flows):
@@ -502,14 +654,40 @@ def build_centrality_metrics(flows):
"metrics_type": "PageRank",
"score": page_rank[port],
})
# page_ranks += [{"group": port, "port": p, "page_rank": value} for (p, value) in page_rank.items() if graph.nodes[p]['internal'] == True]
metrics.append({
"port": port,
"metrics_type": "betweenness centrality",
"score": betweenness_centrality[port]
})
# betweenness_centralities += [{"group": port, "port": p, "betweenness_centrality": value} for (p, value) in betweenness_centrality.items()if graph.nodes[p]['internal'] == True]
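The `page_rank` and `betweenness_centrality` dicts used above are presumably the standard networkx metrics, computed once per admiralty network; a minimal, hedged sketch:

```python
import networkx as nx

# Hypothetical sketch: compute the two metrics used above on one port network
# (`graph` would be the admiralty network built by the function above).
graph = nx.karate_club_graph()  # stand-in network for illustration
page_rank = nx.pagerank(graph)                              # {node: score}
betweenness_centrality = nx.betweenness_centrality(graph)   # {node: score}
```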
info = """
`part_3_centralite_comparaison.csv` documentation
===
# What is the original data ?
Navigo flows from the API "raw flows" endpoint
# What does a line correspond to ?
A given centrality metric for the network of ports attached to a given admiralty or group of admiralties (for La Rochelle we include the "La Rochelle", "Marennes" and "Sables d'Olonnes" admiralties).
# Filters
- year : 1787
- we keep only flows that contain a departure OR destination in the following admiralties : "Bordeaux", "Nantes", "La Rochelle", "Marennes" and "Sables d'Olonnes"
- each network is computed separately
# Aggregation/computation info
- the metric is reported for the main port of each network
# Notes/warning
/
"""
ensure_dir('../public/data/part_3_centralite_comparaison/')
write_readme('part_3_centralite_comparaison/README.md', info)

with open('../public/data/part_3_centralite_comparaison/part_3_centralite_comparaison.csv', 'w', newline='') as csvfile:
fieldnames = metrics[0].keys()
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)