Skip to content

Populando dados CSV no repositório GIT

Peter edited this page May 17, 2019 · 4 revisions

Lembretes:

  • atualizar https://github.com/datasets-br/city-codes com novos dados do IBGE e da Wikidata
  • reusar convenções do Stable, por exemplo nomes de cidade por CamelCase.
  • IBGE, carregar integral, com algoritmo de scan files, etc. Agro2017 e endereços censo2010, ou direto do portal endereços.

Transformando dados do osm_planet_br em relatórios de CEP.

CREATE INDEX idx_planet_osm_point_tags_cep ON planet_osm_point ( (tags ->>'addr:postcode') );
-- acrescer idx_ref para citar OSM, IBGE, etc. junto ao gtype.
CREATE TABLE kx_br_ceps_from_osm AS
 select 'n' gtype, c.uf, replace(c.lexlabel,'.','_')  as municip, 
       pt.tags->>'addr:postcode' as cep,
       count(*) n
 from public.vw_osm_city_polygon c INNER JOIN planet_osm_point pt 
     ON st_contains(c.way,pt.way)
 where pt.tags?'addr:postcode'
 GROUP BY 1,2,3,4
;
INSERT INTO kx_br_ceps_from_osm(gtype,uf,municip,cep,n)
 select 'w' gtype, c.uf, replace(c.lexlabel,'.','_')  as municip, 
       pt.tags->>'addr:postcode' as cep,
       count(*) n
 from public.vw_osm_city_polygon c INNER JOIN planet_osm_line pt 
     ON st_contains(c.way,pt.way)
 where pt.tags?'addr:postcode'
 GROUP BY 1,2,3,4
;
INSERT INTO kx_br_ceps_from_osm(gtype,uf,municip,cep,n)
 select 'r' gtype, c.uf, replace(c .lexlabel,'.','_')  as municip, 
       pt.tags->>'addr:postcode' as cep,
       count(*) n
 from public.vw_osm_city_polygon c INNER JOIN planet_osm_polygon pt 
     ON c.way && pt.way AND st_contains(c.way,pt.way)
 where pt.tags?'addr:postcode'
 GROUP BY 1,2,3,4
;

--- for all, IMPORTANT:
UPDATE kx_br_ceps_from_osm set cep= replace(replace(cep,'-',''),'.',''); -- normalize
DELETE FROM kx_br_ceps_from_osm WHERE cep!~ '^\d{8,8}$'; -- ~100

-- report:
SELECT count(distinct cep),count(*) from kx_br_ceps_from_osm;  -- 20355 | 24151
SELECT count(distinct cep) from kx_br_ceps_from_osm where n>1;  -- 6648
SELECT count(distinct uf||municip) from kx_br_ceps_from_osm where n>1;  -- 1116
CREATE VIEW vw_kx_br_ceps_from_osm AS
      SELECT uf, municip, cep,
         round(SUM(sqrt(n)+ CASE WHEN n<4 THEN n-1 ELSE 2 END)) idx_confirm,
         max(idx_ref)||'-'||array_to_string(array_agg(distinct gtype),'') idx_ref 
      FROM kx_br_ceps_from_osm 
      -- WHERE uf=%L AND municip=%L 
      GROUP BY 1,2,3
      ORDER BY cep
;

Dados IBGE-agro2017

SELECT cod_municipio, count(distinct cep) ceps_distintos, count(*) ceps 
from ibge_agro2017 where cep is not null 
group by 1 
; -- em SP são 640 municípios
SELECT cod_municipio, lexlabel, count(distinct cep) ceps
     --  count(*) n, count(*) FILTER (WHERE is_cep_in_range) n_inrange
from vw_ibge_agro2017_full where cep is not null 
group by 1,2 having count(distinct cep)>1 order by 3 desc
; -- 446

Em SP 446 municípios com mais de um CEP distinto na região rural, deles 6 com mais de 100, a maioria (357) com menos de 10.

cod_municipio_sp lexlabel ceps distintos
50308 sao_paulo 204
9502 campinas 159
25904 jundiai 127
30607 mogi_cruzes 126
49805 sao_jose_rio_preto 120
52502 suzano 102
29005 marilia 76
38709 piracicaba 75
... ... ...

Criando arquivos

Linhas de shell para preparar o diretório /tmp/stable com o material:

rm -r /tmp/stable
# gera mkdir das pastas:
psql -c "select distinct 'mkdir /tmp/stable/'||uf from kx_br_ceps_from_osm"
#... rodar script gerado por copy/paste no terminal

Comando SQL para a formação dos arquivos .csv de CEPs de cada município, via SQL COPY:

-- select distinct 'mkdir /tmp/stable/'||uf from kx_br_ceps_from_osm;
-- chown  -R postgres:postgres /tmp/stable/
SELECT copy_csv(
  uf||'/'||municip||'.csv'
  ,format('
      SELECT cep, idx_confirm, idx_ref 
      FROM vw_kx_br_ceps_from_osm 
      WHERE uf=%L AND municip=%L
     ', uf, municip
   )
  ,true --header
  ,'/tmp/stable/'  -- base path
) FROM (
  select distinct uf,municip from kx_br_ceps_from_osm order by 1
) t;  -- 1787 arquivos. Quando houverem mais refs reuinir com ';'.

PS: para a situação do CEP no OSM em 2018 ver https://forum.openstreetmap.org/viewtopic.php?id=61604

A função de cópia faz parte da lib do projeto,

/**
 * COPY TO CSV HEADER.
 */
CREATE or replace FUNCTION copy_csv(
  p_filename  text,
  p_query     text,
  p_useheader boolean = true,
  p_root      text    = '/tmp/'
) RETURNS text AS $f$
BEGIN
  EXECUTE format(
    'COPY (%s) TO %L CSV %s'
    ,p_query
    ,p_root||p_filename
    ,CASE WHEN p_useheader THEN 'HEADER' ELSE '' END
  );
  RETURN p_filename;
END;
$f$ LANGUAGE plpgsql STRICT;
Clone this wiki locally