Skip to content

Latest commit

 

History

History
424 lines (368 loc) · 9.65 KB

File metadata and controls

424 lines (368 loc) · 9.65 KB

Create

2 terminal windows

Infra

(cd 005_unnecessary_dw_workloads/analytics_in_cockroachdb && docker compose up -d)
docker exec -it node1 cockroach init --insecure
docker exec -it node1 cockroach sql --insecure

Enable enterprise

enterprise -url "postgres://root@localhost:26257?sslmode=disable"

Create table and populate

CREATE TABLE customers (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  email STRING UNIQUE NOT NULL
);

CREATE TABLE products (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  name STRING NOT NULL,
  price DECIMAL NOT NULL
);

CREATE TABLE orders (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  customer_id UUID NOT NULL REFERENCES customers(id),
  ts TIMESTAMPTZ NOT NULL DEFAULT now(),
  total DECIMAL NOT NULL
);

CREATE TABLE order_items (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  order_id UUID NOT NULL REFERENCES orders(id),
  product_id UUID NOT NULL REFERENCES products(id),
  quantity INTEGER NOT NULL
);

CREATE TABLE payments (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  order_id UUID REFERENCES orders(id),
  ts TIMESTAMPTZ DEFAULT now(),
  amount DECIMAL NOT NULL
);

Generate data

dg \
  -c 005_unnecessary_dw_workloads/analytics_in_cockroachdb/dg.yaml \
  -o 005_unnecessary_dw_workloads/analytics_in_cockroachdb/csvs \
  -i imports.sql

python3 \
  -m http.server 9090 \
  -d 005_unnecessary_dw_workloads/analytics_in_cockroachdb/csvs

Import data

IMPORT INTO customers (
	id, email
)
CSV DATA (
    'http://host.docker.internal:9090/customers.csv'
)
WITH skip='1', nullif = '', allow_quoted_null;

IMPORT INTO products (
	id, name, price
)
CSV DATA (
    'http://host.docker.internal:9090/products.csv'
)
WITH skip='1', nullif = '', allow_quoted_null;

IMPORT INTO orders (
	id, customer_id, ts, total
)
CSV DATA (
    'http://host.docker.internal:9090/orders.csv'
)
WITH skip='1', nullif = '', allow_quoted_null;

IMPORT INTO order_items (
	id, order_id, product_id, quantity
)
CSV DATA (
    'http://host.docker.internal:9090/order_items.csv'
)
WITH skip='1', nullif = '', allow_quoted_null;

IMPORT INTO payments (
	order_id, id, ts, amount
)
CSV DATA (
    'http://host.docker.internal:9090/payments.csv'
)
WITH skip='1', nullif = '', allow_quoted_null;

Simulate transactional workload

go run 005_unnecessary_dw_workloads/analytics_in_cockroachdb/main.go

k6 run 005_unnecessary_dw_workloads/analytics_in_cockroachdb/load.js

DEBUG Test transactional workload

# Insert customer
curl "http://localhost:3000/customers" \
  -H 'Content-Type: application/json' \
  -d '{
    "id": "68b790f4-9527-4a51-b0fd-b530613f34a9",
    "email": "[email protected]"
  }'

# Get products
curl "http://localhost:3000/products" | jq

# Insert order
curl "http://localhost:3000/orders" \
  -H 'Content-Type: application/json' \
  -d '{
    "id": "318c7a41-aacb-4166-9179-706d4e60de83",
    "customer_id": "68b790f4-9527-4a51-b0fd-b530613f34a9",
    "items": [
      {
        "id": "c4c12e8f-6dc3-48d3-86ea-93cb01ee63c0",
        "quantity": 48
      },
      {
        "id": "9dcfbcd2-1848-4b50-b3f5-73b09d70d5be",
        "quantity": 1
      }
    ],
    "total": 100
  }'

Analytics

Setup

CREATE ROLE analytics WITH login;
GRANT SELECT ON * TO analytics;

CREATE USER analytics_user;
GRANT analytics TO analytics_user;

ALTER ROLE analytics SET default_transaction_use_follower_reads = 'on';
ALTER ROLE analytics SET default_transaction_priority = 'low';
ALTER ROLE analytics SET default_transaction_read_only = 'on';
ALTER ROLE analytics SET statement_timeout = '10m';

-- Remove some payments for the analytics queries.
DELETE FROM payments p
WHERE true
ORDER BY random()
LIMIT 5;
cockroach sql --url "postgres://analytics@localhost:26257/defaultdb?sslmode=disable" --insecure

Queries

-- Fetch a customer and their orders.
SELECT
  c.email,
  o.id,
  o.total,
  oi.quantity,
  p.price
FROM customers c
LEFT JOIN orders o ON c.id = o.customer_id
LEFT JOIN order_items oi ON o.id = oi.order_id
LEFT JOIN products p ON oi.product_id = p.id
WHERE c.id = '0a3546b5-6ad3-49b2-b960-dc6958faca30'
ORDER BY c.id, o.id, oi.id;

-- Show user-specific variables.
SHOW TRANSACTION PRIORITY;

-- Busiest months in history.
SELECT
  date_trunc('month', ts)::DATE mth,
  COUNT(*)
FROM orders
AS OF SYSTEM TIME follower_read_timestamp()
GROUP BY date_trunc('month', ts) 
ORDER BY count DESC
LIMIT 10;

-- Most profitable months in history.
SELECT
  date_trunc('month', o.ts) AS month,
  SUM(o.total) AS monthly_revenue
FROM orders o
AS OF SYSTEM TIME follower_read_timestamp()
GROUP BY month
ORDER BY monthly_revenue DESC
LIMIT 10;

-- Biggest spenders.
SELECT
  c.email,
  SUM(o.total) AS total_spend,
  COUNT(o.id) AS order_count,
  ROUND(SUM(o.total) / COUNT(o.id)) AS order_average
FROM customers c
JOIN orders o ON c.id = o.customer_id
AS OF SYSTEM TIME follower_read_timestamp()
GROUP BY c.email
ORDER BY total_spend DESC
LIMIT 10;

-- Biggest average spenders.
SELECT
  c.email,
  ROUND(AVG(o.total)) AS average_spend
FROM customers c
JOIN orders o ON c.id = o.customer_id
AS OF SYSTEM TIME follower_read_timestamp()
GROUP BY c.email
ORDER BY average_spend DESC
LIMIT 10;

-- Most popular products.
SELECT
  p.name AS product,
  SUM(oi.quantity) AS total_quantity_sold
FROM products p
JOIN order_items oi ON p.id = oi.product_id
AS OF SYSTEM TIME follower_read_timestamp()
GROUP BY p.name
ORDER BY total_quantity_sold DESC
LIMIT 10;

-- Least popular products.
SELECT
  p.name AS product,
  SUM(oi.quantity) AS total_quantity_sold
FROM products p
JOIN order_items oi ON p.id = oi.product_id
AS OF SYSTEM TIME follower_read_timestamp()
GROUP BY p.name
ORDER BY total_quantity_sold
LIMIT 10;

-- Idle customers.
SELECT
  c.email,
  MAX(o.ts) AS latest_order_date
FROM customers c
JOIN orders o ON c.id = o.customer_id
AS OF SYSTEM TIME follower_read_timestamp()
GROUP BY c.email
ORDER BY latest_order_date
LIMIT 10;

-- Orders pending payment.
SELECT
  o.id AS order_id,
  o.customer_id,
  o.total
FROM orders o
LEFT JOIN payments p ON o.id = p.order_id
AS OF SYSTEM TIME follower_read_timestamp()
WHERE p.id IS NULL
ORDER BY o.total DESC;

-- Product affinity.
WITH product_combinations AS (
  SELECT
    oi1.product_id AS product1,
    oi2.product_id AS product2,
    COUNT(DISTINCT oi1.order_id) AS order_count
  FROM order_items oi1
  JOIN order_items oi2
    ON oi1.order_id = oi2.order_id
    AND oi1.product_id != oi2.product_id
  GROUP BY oi1.product_id, oi2.product_id
)
SELECT
  product1,
  product2,
  order_count
FROM product_combinations
ORDER BY order_count DESC
LIMIT 10;

-- RFM (Recency, Frequency, Monetary) analysis.
-- To: Identify high-worth customers who've not purchased in a while.
WITH customer_rfm AS (
  SELECT
    customer_id,
    now()::DATE - MAX(ts)::DATE AS recency,
    COUNT(DISTINCT o.id) AS frequency,
    SUM(o.total) AS monetary
  FROM orders o
  WHERE o.ts <= now()
  GROUP BY customer_id
)
SELECT
  customer_id,
  recency,
  frequency,
  monetary
FROM customer_rfm
WHERE frequency >= 100
AND monetary >= 10000
ORDER BY recency DESC, frequency DESC, monetary DESC;

-- Product sales monthly moving average.
WITH product_sales AS (
  SELECT
    p.id AS product_id,
    DATE_TRUNC('year', o.ts) AS year,
    SUM(oi.quantity) AS sold
  FROM products p
  LEFT JOIN order_items oi ON p.id = oi.product_id
  LEFT JOIN orders o ON oi.order_id = o.id
  GROUP BY p.id, year
)
SELECT
  p.name,
  EXTRACT('year', ps.year),
  ps.sold,
  TRUNC(AVG(ps.sold) OVER (PARTITION BY ps.product_id ORDER BY ps.year ROWS BETWEEN 1 PRECEDING AND CURRENT ROW)) AS moving_average,
  COALESCE(ps.sold - LAG(ps.sold) OVER (PARTITION BY ps.product_id), 0) AS year_diff
FROM product_sales ps
JOIN products p ON ps.product_id = p.id
WHERE year >= '2020-01-01'
ORDER BY p.name, ps.year;

-- Customer churn prediction.
WITH
  customer_purchases AS (
    SELECT
      c.id AS customer_id,
      c.email,
      MAX(o.ts)::DATE AS last_purchase_date,
      COUNT(DISTINCT o.id) AS total_order_count,
      COUNT(DISTINCT CASE WHEN o.ts >= CURRENT_DATE - INTERVAL '30 days' THEN o.id END) AS orders_last_month,
      COUNT(DISTINCT CASE WHEN o.ts >= CURRENT_DATE - INTERVAL '1 year' THEN o.id END) AS orders_last_year
    FROM
      customers c
    LEFT JOIN
      orders o ON c.id = o.customer_id
    GROUP BY
      c.id, c.email
  ),
  customer_churn_risk AS (
    SELECT
      email,
      last_purchase_date,
      total_order_count,
      orders_last_year,
      orders_last_month,
      CASE
        WHEN total_order_count > 100
          AND NOW()::DATE - MAX(last_purchase_date)::DATE > 90
          AND orders_last_year < 30
          THEN 'high risk'
        WHEN NOW()::DATE - MAX(last_purchase_date)::DATE > 30
          AND orders_last_month < 10
          THEN 'medium risk'
        ELSE 'low risk'
      END AS churn_status
    FROM customer_purchases
    GROUP BY email, last_purchase_date, total_order_count, orders_last_year, orders_last_month
  )
SELECT
  email,
  total_order_count,
  orders_last_year,
  orders_last_month,
  churn_status
FROM customer_churn_risk
WHERE orders_last_year > 0
AND churn_status != 'low risk'
ORDER BY churn_status DESC, total_order_count DESC, orders_last_year DESC, orders_last_month DESC;

Scratchpad

-- ef8b898b-070b-4710-b57c-caaddcd09d3a + 6e01fd2d-e761-4414-87a7-35d0c6e63ff7 = 305
SELECT
  oi.order_id,
  COUNT(*)
FROM order_items oi
WHERE oi.product_id IN ('ef8b898b-070b-4710-b57c-caaddcd09d3a', '6e01fd2d-e761-4414-87a7-35d0c6e63ff7')
GROUP BY oi.order_id;

Summary

  • Follower reads won't interfere with other transactions or cause retries.
  • Follower read transaction will always run without interruption, as they won't get pushed because of writes occurring mid query.

Teardown

make teardown