repos / pico

pico services mono repo
git clone https://github.com/picosh/pico.git

commit
dc6ed61
parent
23438de
author
Eric Bower
date
2026-05-04 23:37:38 -0400 EDT
chore: remove dev artifacts
2 files changed,  +0, -642
D cmd/scripts/compare-analytics/main.go
+0, -306
  1@@ -1,306 +0,0 @@
  2-package main
  3-
  4-import (
  5-	"flag"
  6-	"fmt"
  7-	"log/slog"
  8-	"os"
  9-	"time"
 10-
 11-	"github.com/picosh/pico/pkg/db"
 12-	"github.com/picosh/pico/pkg/db/postgres"
 13-)
 14-
 15-func main() {
 16-	host := flag.String("host", "", "host to compare (required)")
 17-	userID := flag.String("user-id", "", "user ID to compare (required)")
 18-	interval := flag.String("interval", "day", "interval: day, week, month")
 19-	origin := flag.String("origin", "", "origin date in YYYY-MM-DD format (default: start of year)")
 20-	flag.Parse()
 21-
 22-	if *host == "" || *userID == "" {
 23-		fmt.Fprintln(os.Stderr, "usage: compare-analytics -host example.com -user-id <uuid> [-interval day|week|month] [-origin 2025-01-01]")
 24-		os.Exit(1)
 25-	}
 26-
 27-	logger := slog.Default()
 28-	dbURL := os.Getenv("DATABASE_URL")
 29-	if dbURL == "" {
 30-		fmt.Fprintln(os.Stderr, "DATABASE_URL must be set")
 31-		os.Exit(1)
 32-	}
 33-
 34-	dbpool := postgres.NewDB(dbURL, logger)
 35-	defer func() { _ = dbpool.Close() }()
 36-
 37-	originTime := time.Date(time.Now().AddDate(-1, 0, 0).Year(), time.January, 1, 0, 0, 0, 0, time.UTC)
 38-	if *origin != "" {
 39-		var err error
 40-		originTime, err = time.Parse("2006-01-02", *origin)
 41-		if err != nil {
 42-			logger.Error("invalid origin format, use YYYY-MM-DD", "err", err)
 43-			os.Exit(1)
 44-		}
 45-	}
 46-
 47-	opts := &db.SummaryOpts{
 48-		Host:     *host,
 49-		UserID:   *userID,
 50-		Interval: *interval,
 51-		Origin:   originTime,
 52-	}
 53-
 54-	// New path: reads from summary tables + current month raw
 55-	newSummary, err := dbpool.VisitSummary(opts)
 56-	if err != nil {
 57-		logger.Error("VisitSummary failed", "err", err)
 58-		os.Exit(1)
 59-	}
 60-
 61-	// Old path: direct queries against analytics_visits
 62-	oldSummary, err := queryRawVisits(dbpool, opts)
 63-	if err != nil {
 64-		logger.Error("raw query failed", "err", err)
 65-		os.Exit(1)
 66-	}
 67-
 68-	// Compare
 69-	fmt.Println("=== Unique Visitors (Intervals) ===")
 70-	compareIntervals(newSummary.Intervals, oldSummary.Intervals)
 71-
 72-	fmt.Println("\n=== Top URLs ===")
 73-	compareUrls(newSummary.TopUrls, oldSummary.TopUrls)
 74-
 75-	fmt.Println("\n=== Top Referers ===")
 76-	compareUrls(newSummary.TopReferers, oldSummary.TopReferers)
 77-
 78-	fmt.Println("\n=== 404 URLs ===")
 79-	compareUrls(newSummary.NotFoundUrls, oldSummary.NotFoundUrls)
 80-}
 81-
 82-func queryRawVisits(dbpool *postgres.PsqlDB, opts *db.SummaryOpts) (*db.SummaryVisits, error) {
 83-	intervals, err := rawVisitUnique(dbpool, opts)
 84-	if err != nil {
 85-		return nil, fmt.Errorf("raw visit unique: %w", err)
 86-	}
 87-	urls, err := rawVisitUrl(dbpool, opts)
 88-	if err != nil {
 89-		return nil, fmt.Errorf("raw visit url: %w", err)
 90-	}
 91-	refs, err := rawVisitReferer(dbpool, opts)
 92-	if err != nil {
 93-		return nil, fmt.Errorf("raw visit referer: %w", err)
 94-	}
 95-	notFound, err := rawVisitUrlNotFound(dbpool, opts)
 96-	if err != nil {
 97-		return nil, fmt.Errorf("raw visit url not found: %w", err)
 98-	}
 99-	return &db.SummaryVisits{
100-		Intervals:    intervals,
101-		TopUrls:      urls,
102-		TopReferers:  refs,
103-		NotFoundUrls: notFound,
104-	}, nil
105-}
106-
107-func rawVisitUnique(dbpool *postgres.PsqlDB, opts *db.SummaryOpts) ([]*db.VisitInterval, error) {
108-	where, with := visitFilterBy(opts)
109-	query := fmt.Sprintf(`
110-		SELECT date_trunc('%s', created_at)::timestamptz as interval_start,
111-		       count(DISTINCT ip_address) as unique_visitors
112-		FROM analytics_visits
113-		WHERE created_at >= $1 AND %s = $2 AND user_id = $3 AND status <> 404
114-		GROUP BY interval_start
115-		ORDER BY interval_start`, opts.Interval, where)
116-
117-	rows, err := dbpool.Db.Queryx(query, opts.Origin, with, opts.UserID)
118-	if err != nil {
119-		return nil, err
120-	}
121-	defer func() { _ = rows.Close() }()
122-
123-	var intervals []*db.VisitInterval
124-	for rows.Next() {
125-		iv := &db.VisitInterval{}
126-		if err := rows.Scan(&iv.Interval, &iv.Visitors); err != nil {
127-			return nil, err
128-		}
129-		intervals = append(intervals, iv)
130-	}
131-	return intervals, rows.Err()
132-}
133-
134-func rawVisitUrl(dbpool *postgres.PsqlDB, opts *db.SummaryOpts) ([]*db.VisitUrl, error) {
135-	where, with := visitFilterBy(opts)
136-	query := fmt.Sprintf(`
137-		SELECT path as url, count(DISTINCT ip_address) as count
138-		FROM analytics_visits
139-		WHERE created_at >= $1 AND %s = $2 AND user_id = $3 AND path <> '' AND status <> 404
140-		GROUP BY path
141-		ORDER BY count DESC
142-		LIMIT 10`, where)
143-
144-	rows, err := dbpool.Db.Queryx(query, opts.Origin, with, opts.UserID)
145-	if err != nil {
146-		return nil, err
147-	}
148-	defer func() { _ = rows.Close() }()
149-
150-	var urls []*db.VisitUrl
151-	for rows.Next() {
152-		u := &db.VisitUrl{}
153-		if err := rows.Scan(&u.Url, &u.Count); err != nil {
154-			return nil, err
155-		}
156-		urls = append(urls, u)
157-	}
158-	return urls, rows.Err()
159-}
160-
161-func rawVisitReferer(dbpool *postgres.PsqlDB, opts *db.SummaryOpts) ([]*db.VisitUrl, error) {
162-	where, with := visitFilterBy(opts)
163-	query := fmt.Sprintf(`
164-		SELECT referer as url, count(DISTINCT ip_address) as count
165-		FROM analytics_visits
166-		WHERE created_at >= $1 AND %s = $2 AND user_id = $3 AND referer <> '' AND status <> 404
167-		GROUP BY referer
168-		ORDER BY count DESC
169-		LIMIT 10`, where)
170-
171-	rows, err := dbpool.Db.Queryx(query, opts.Origin, with, opts.UserID)
172-	if err != nil {
173-		return nil, err
174-	}
175-	defer func() { _ = rows.Close() }()
176-
177-	var urls []*db.VisitUrl
178-	for rows.Next() {
179-		u := &db.VisitUrl{}
180-		if err := rows.Scan(&u.Url, &u.Count); err != nil {
181-			return nil, err
182-		}
183-		urls = append(urls, u)
184-	}
185-	return urls, rows.Err()
186-}
187-
188-func rawVisitUrlNotFound(dbpool *postgres.PsqlDB, opts *db.SummaryOpts) ([]*db.VisitUrl, error) {
189-	where, with := visitFilterBy(opts)
190-	limit := opts.Limit
191-	if limit == 0 {
192-		limit = 10
193-	}
194-	query := fmt.Sprintf(`
195-		SELECT path as url, count(DISTINCT ip_address) as count
196-		FROM analytics_visits
197-		WHERE created_at >= $1 AND %s = $2 AND user_id = $3 AND path <> '' AND status = 404
198-		GROUP BY path
199-		ORDER BY count DESC
200-		LIMIT %d`, where, limit)
201-
202-	rows, err := dbpool.Db.Queryx(query, opts.Origin, with, opts.UserID)
203-	if err != nil {
204-		return nil, err
205-	}
206-	defer func() { _ = rows.Close() }()
207-
208-	var urls []*db.VisitUrl
209-	for rows.Next() {
210-		u := &db.VisitUrl{}
211-		if err := rows.Scan(&u.Url, &u.Count); err != nil {
212-			return nil, err
213-		}
214-		urls = append(urls, u)
215-	}
216-	return urls, rows.Err()
217-}
218-
219-func visitFilterBy(opts *db.SummaryOpts) (string, string) {
220-	if opts.Host != "" {
221-		return "host", opts.Host
222-	}
223-	if opts.Path != "" {
224-		return "path", opts.Path
225-	}
226-	return "host", ""
227-}
228-
229-func compareIntervals(newData, oldData []*db.VisitInterval) {
230-	newMap := make(map[string]int)
231-	for _, iv := range newData {
232-		key := iv.Interval.Format("2006-01-02")
233-		newMap[key] = iv.Visitors
234-	}
235-	oldMap := make(map[string]int)
236-	for _, iv := range oldData {
237-		key := iv.Interval.Format("2006-01-02")
238-		oldMap[key] = iv.Visitors
239-	}
240-
241-	// Collect all keys
242-	keys := make(map[string]bool)
243-	for k := range newMap {
244-		keys[k] = true
245-	}
246-	for k := range oldMap {
247-		keys[k] = true
248-	}
249-
250-	mismatches := 0
251-	for k := range keys {
252-		n, nOk := newMap[k]
253-		o, oOk := oldMap[k]
254-		if nOk && oOk {
255-			if n != o {
256-				fmt.Printf("  MISMATCH %s: new=%d old=%d\n", k, n, o)
257-				mismatches++
258-			}
259-		} else if nOk {
260-			fmt.Printf("  NEW ONLY %s: %d\n", k, n)
261-			mismatches++
262-		} else {
263-			fmt.Printf("  OLD ONLY %s: %d\n", k, o)
264-			mismatches++
265-		}
266-	}
267-	if mismatches == 0 {
268-		fmt.Println("  OK: all intervals match")
269-	} else {
270-		fmt.Printf("  %d mismatches\n", mismatches)
271-	}
272-}
273-
274-func compareUrls(newData, oldData []*db.VisitUrl) {
275-	newMap := make(map[string]int)
276-	for _, u := range newData {
277-		newMap[u.Url] = u.Count
278-	}
279-	oldMap := make(map[string]int)
280-	for _, u := range oldData {
281-		oldMap[u.Url] = u.Count
282-	}
283-
284-	mismatches := 0
285-	for url, nCount := range newMap {
286-		if oCount, ok := oldMap[url]; ok {
287-			if nCount != oCount {
288-				fmt.Printf("  MISMATCH %s: new=%d old=%d\n", url, nCount, oCount)
289-				mismatches++
290-			}
291-		} else {
292-			fmt.Printf("  NEW ONLY %s: %d\n", url, nCount)
293-			mismatches++
294-		}
295-	}
296-	for url, oCount := range oldMap {
297-		if _, ok := newMap[url]; !ok {
298-			fmt.Printf("  OLD ONLY %s: %d\n", url, oCount)
299-			mismatches++
300-		}
301-	}
302-	if mismatches == 0 {
303-		fmt.Println("  OK: all entries match")
304-	} else {
305-		fmt.Printf("  %d mismatches\n", mismatches)
306-	}
307-}
D docs/analytics-retention-proposal.md
+0, -336
  1@@ -1,336 +0,0 @@
  2-# Analytics Data Retention Proposal
  3-
  4-## Problem
  5-
  6-The `analytics_visits` table has **18 million rows** in production. All analytics queries (unique visitors, top URLs, top referers, 404s) aggregate raw visit data on every request, causing slow query performance. The table continues to grow unbounded.
  7-
  8-## Goal
  9-
 10-- Keep `analytics_visits` small by deleting raw data older than **1 month**
 11-- Pre-aggregate monthly stats into summary tables so historical queries remain fast
 12-- Preserve all data the user currently sees in the TUI and SSH CLI
 13-
 14----
 15-
 16-## What We Display Today
 17-
 18-### TUI Analytics Screen (`pkg/tui/analytics.go`)
 19-
 20-When a user views analytics for a site, we show:
 21-
 22-| Section | Query | Aggregation |
 23-|---------|-------|-------------|
 24-| **Site list** (left pane) | `visitHost()` | `COUNT(DISTINCT ip_address)` grouped by `host` |
 25-| **Visits by period** | `visitUnique()` | `COUNT(DISTINCT ip_address)` grouped by `date_trunc(interval, created_at)` |
 26-| **Top URLs** | `visitUrl()` | `COUNT(DISTINCT ip_address)` grouped by `path`, LIMIT 10, excludes 404s |
 27-| **Top Referers** | `visitReferer()` | `COUNT(DISTINCT ip_address)` grouped by `referer`, LIMIT 10, excludes 404s |
 28-| **Top 404s** | `VisitUrlNotFound()` | `COUNT(DISTINCT ip_address)` grouped by `path`, LIMIT 10, status=404 |
 29-
 30-The TUI supports two intervals:
 31-- **"day"** — daily buckets from start of current month (`StartOfMonth()`)
 32-- **"month"** — monthly buckets from 1 year ago (`StartOfYear()` which is `now - 1 year`)
 33-
 34-### SSH CLI (`pkg/apps/pico/cli.go`)
 35-
 36-| Command | Query | Aggregation |
 37-|---------|-------|-------------|
 38-| `not-found hostname.com [year\|month]` | `VisitUrlNotFound()` | `COUNT(DISTINCT ip_address)` grouped by `path`, LIMIT 100, status=404 |
 39-
 40----
 41-
 42-## Proposed Schema
 43-
 44-### New Table: `analytics_user_sites`
 45-
 46-Tracks every site a user has had traffic on. Powers the site list in the left pane of the TUI.
 47-
 48-```sql
 49-CREATE TABLE analytics_user_sites (
 50-    id              SERIAL PRIMARY KEY,
 51-    user_id         UUID NOT NULL REFERENCES app_users(id) ON DELETE CASCADE,
 52-    host            VARCHAR(253) NOT NULL,
 53-    total_visits    INT NOT NULL DEFAULT 0,       -- running total of all-time unique visitors
 54-    last_seen       DATE NOT NULL,                 -- most recent month with traffic
 55-    created_at      TIMESTAMP NOT NULL DEFAULT now(),
 56-    updated_at      TIMESTAMP NOT NULL DEFAULT now(),
 57-
 58-    UNIQUE (user_id, host)
 59-);
 60-
 61-CREATE INDEX idx_user_sites_user ON analytics_user_sites (user_id);
 62-```
 63-
 64-**How it's populated:** During monthly aggregation, for each `(user_id, host)` pair found in the previous month's raw data:
 65-- `INSERT ... ON CONFLICT (user_id, host) DO UPDATE` — add that month's unique visitor count to `total_visits`, set `last_seen` to the month's date.
 66-
 67-**Query replacement:** `visitHost()` becomes:
 68-```sql
 69-SELECT host, total_visits as host_count
 70-FROM analytics_user_sites
 71-WHERE user_id = $1
 72-ORDER BY total_visits DESC
 73-```
 74-
 75-No GROUP BY, no scan of raw data — just an indexed lookup.
 76-
 77-### New Table: `analytics_monthly_visits`
 78-
 79-Stores pre-aggregated daily unique visitor counts per host. This powers the "visits by period" chart and site list.
 80-
 81-```sql
 82-CREATE TABLE analytics_monthly_visits (
 83-    id              SERIAL PRIMARY KEY,
 84-    user_id         UUID NOT NULL REFERENCES app_users(id) ON DELETE CASCADE,
 85-    host            VARCHAR(253) NOT NULL,
 86-    visit_date      DATE NOT NULL,              -- the day this count is for
 87-    unique_visits   INT NOT NULL DEFAULT 0,     -- COUNT(DISTINCT ip_address) for that day
 88-    mobile_visits   INT NOT NULL DEFAULT 0,     -- unique visitors from mobile user-agents
 89-    desktop_visits  INT NOT NULL DEFAULT 0,     -- unique visitors from desktop user-agents
 90-    created_at      TIMESTAMP NOT NULL DEFAULT now(),
 91-
 92-    UNIQUE (user_id, host, visit_date)
 93-);
 94-
 95-CREATE INDEX idx_monthly_visits_user_host ON analytics_monthly_visits (user_id, host);
 96-CREATE INDEX idx_monthly_visits_user_date ON analytics_monthly_visits (user_id, visit_date DESC);
 97-```
 98-
 99-**Why this shape:** The TUI queries `date_trunc('day', created_at)` or `date_trunc('month', created_at)` with `COUNT(DISTINCT ip_address)`. We pre-compute the daily distinct count. For "month" interval view, the app sums daily rows within each month client-side or via a simple `GROUP BY date_trunc('month', visit_date)`.
100-
 101-**Device detection:** `mobile_visits` and `desktop_visits` are derived from the `user_agent` column during aggregation. A user-agent library (e.g., [`pdericx/ua-parser`](https://github.com/pdericx/ua-parser) or [`mssola/useragent`](https://github.com/mssola/useragent)) classifies each visit. An IP is counted as mobile if its user-agent is mobile, and as desktop otherwise. Note: `mobile_visits + desktop_visits = unique_visits` since every visit is one or the other.
102-
103-### New Table: `analytics_monthly_top_urls`
104-
105-Stores the top URLs per host per month. Powers "Top URLs" and "Top 404s" sections.
106-
107-```sql
108-CREATE TABLE analytics_monthly_top_urls (
109-    id            SERIAL PRIMARY KEY,
110-    user_id       UUID NOT NULL REFERENCES app_users(id) ON DELETE CASCADE,
111-    host          VARCHAR(253) NOT NULL,
112-    month         DATE NOT NULL,           -- first day of the month (e.g., 2025-01-01)
113-    path          VARCHAR(2048) NOT NULL,
114-    unique_visits INT NOT NULL DEFAULT 0,  -- COUNT(DISTINCT ip_address) for that path
115-    is_404        BOOLEAN NOT NULL DEFAULT false,
116-    rank          INT NOT NULL,            -- 1-10 (top 10) or 1-100 for CLI
117-    created_at    TIMESTAMP NOT NULL DEFAULT now(),
118-
119-    UNIQUE (user_id, host, month, path, is_404)
120-);
121-
122-CREATE INDEX idx_monthly_top_urls_user_host_month ON analytics_monthly_top_urls (user_id, host, month);
123-CREATE INDEX idx_monthly_top_urls_404 ON analytics_monthly_top_urls (user_id, host, month, is_404, rank);
124-```
125-
126-### New Table: `analytics_monthly_top_referers`
127-
128-Stores top referers per host per month. Powers "Top Referers" section.
129-
130-```sql
131-CREATE TABLE analytics_monthly_top_referers (
132-    id            SERIAL PRIMARY KEY,
133-    user_id       UUID NOT NULL REFERENCES app_users(id) ON DELETE CASCADE,
134-    host          VARCHAR(253) NOT NULL,
135-    month         DATE NOT NULL,           -- first day of the month
136-    referer       VARCHAR(253) NOT NULL,
137-    unique_visits INT NOT NULL DEFAULT 0,
138-    rank          INT NOT NULL,            -- 1-10
139-    created_at    TIMESTAMP NOT NULL DEFAULT now(),
140-
141-    UNIQUE (user_id, host, month, referer)
142-);
143-
144-CREATE INDEX idx_monthly_referers_user_host_month ON analytics_monthly_top_referers (user_id, host, month);
145-```
146-
147----
148-
149-## Monthly Aggregation Job
150-
151-A script (e.g., `cmd/scripts/analytics-aggregate/main.go`) runs **once per month** (e.g., on the 1st). It:
152-
153-### Step 1: Aggregate the previous month
154-
155-For each user with the `analytics` feature flag who has raw visits in the previous month:
156-
157-```
158-FOR each (user_id, host) pair in analytics_visits WHERE created_at IN previous_month:
159-    -- Daily unique visitors (with device breakdown)
160-    -- Device detection runs in Go (user-agent parsing), not pure SQL.
161-    -- Pseudo-SQL for clarity:
162-    INSERT INTO analytics_monthly_visits (user_id, host, visit_date, unique_visits, mobile_visits, desktop_visits)
163-    SELECT user_id, host, date_trunc('day', created_at)::date,
164-           COUNT(DISTINCT ip_address),
165-           COUNT(DISTINCT CASE WHEN is_mobile(user_agent) THEN ip_address END),
166-           COUNT(DISTINCT CASE WHEN NOT is_mobile(user_agent) THEN ip_address END)
167-    FROM analytics_visits
168-    WHERE user_id = $1 AND host = $2 AND created_at >= start_of_month AND created_at < start_of_next_month
169-    GROUP BY user_id, host, date_trunc('day', created_at)
170-    ON CONFLICT (user_id, host, visit_date) DO NOTHING;
171-
172-    -- In practice, `is_mobile()` is a Go function. The aggregation job fetches
173-    -- raw rows in batches, classifies each user_agent, and issues INSERTs.
174-
175-    -- Top 10 URLs (non-404)
176-    INSERT INTO analytics_monthly_top_urls (user_id, host, month, path, unique_visits, is_404, rank)
177-    SELECT user_id, host, date_trunc('month', created_at)::date, path, COUNT(DISTINCT ip_address), false,
178-           ROW_NUMBER() OVER (PARTITION BY user_id, host ORDER BY COUNT(DISTINCT ip_address) DESC)
179-    FROM analytics_visits
180-    WHERE user_id = $1 AND host = $2 AND created_at >= start_of_month AND created_at < start_of_next_month AND status <> 404
181-    GROUP BY user_id, host, path
182-    HAVING ROW_NUMBER() <= 10
183-    ON CONFLICT (user_id, host, month, path, is_404) DO NOTHING;
184-
185-    -- Top 10 URLs (404s)
186-    INSERT INTO analytics_monthly_top_urls (user_id, host, month, path, unique_visits, is_404, rank)
187-    SELECT user_id, host, date_trunc('month', created_at)::date, path, COUNT(DISTINCT ip_address), true,
188-           ROW_NUMBER() OVER (PARTITION BY user_id, host ORDER BY COUNT(DISTINCT ip_address) DESC)
189-    FROM analytics_visits
190-    WHERE user_id = $1 AND host = $2 AND created_at >= start_of_month AND created_at < start_of_next_month AND status = 404
191-    GROUP BY user_id, host, path
192-    HAVING ROW_NUMBER() <= 100   -- 100 for CLI, 10 for TUI (use 100 to cover both)
193-    ON CONFLICT (user_id, host, month, path, is_404) DO NOTHING;
194-
195-    -- Top 10 referers
196-    INSERT INTO analytics_monthly_top_referers (user_id, host, month, referer, unique_visits, rank)
197-    SELECT user_id, host, date_trunc('month', created_at)::date, referer, COUNT(DISTINCT ip_address),
198-           ROW_NUMBER() OVER (PARTITION BY user_id, host ORDER BY COUNT(DISTINCT ip_address) DESC)
199-    FROM analytics_visits
200-    WHERE user_id = $1 AND host = $2 AND created_at >= start_of_month AND created_at < start_of_next_month AND referer <> '' AND status <> 404
201-    GROUP BY user_id, host, referer
202-    HAVING ROW_NUMBER() <= 10
203-    ON CONFLICT (user_id, host, month, referer) DO NOTHING;
204-```
205-
206-### Step 2: Delete raw data older than 1 month
207-
208-```sql
209-DELETE FROM analytics_visits
210-WHERE created_at < date_trunc('month', now())::timestamp;
211-```
212-
213-This keeps only the current month's raw data. After the first run, `analytics_visits` should drop from ~18M rows to ~1 month of data (estimated 150K–500K depending on traffic).
214-
215-### Step 3: Backfill (one-time)
216-
217-On first deployment, run the aggregation for all historical months before deleting anything. This is a separate migration pass that iterates month-by-month from the oldest data to 2 months ago.
218-
219----
220-
221-## Query Changes
222-
223-### Current queries → New queries
224-
225-Each existing query method gets a `since` parameter. If `since` is within the last month, query `analytics_visits` directly (current behavior). If `since` spans older months, union results from summary tables with raw data.
226-
227-#### `visitUnique()` — visits by period
228-
229-```
230--- For "day" interval (current month only, always from raw):
231--- No change — reads from analytics_visits
232-
233--- For "month" interval (historical):
234--- Read from analytics_monthly_visits, sum daily counts per month:
235-SELECT visit_date::date as interval_start, SUM(unique_visits) as visitors
236-FROM analytics_monthly_visits
237-WHERE user_id = $1 AND host = $2 AND visit_date >= $3
238-GROUP BY date_trunc('month', visit_date), visit_date
239-ORDER BY visit_date
240-```
241-
242-#### `visitUrl()` — top URLs
243-
244-```
245--- Current month: read from analytics_visits (no change)
246--- Historical months: read from analytics_monthly_top_urls WHERE is_404 = false
247-```
248-
249-#### `visitReferer()` — top referers
250-
251-```
252--- Current month: read from analytics_visits (no change)  
253--- Historical months: read from analytics_monthly_top_referers
254-```
255-
256-#### `VisitUrlNotFound()` — top 404s
257-
258-```
259--- Current month: read from analytics_visits (no change)
260--- Historical months: read from analytics_monthly_top_urls WHERE is_404 = true
261-```
262-
263-#### `visitHost()` — site list
264-
265-```
266--- Replaced entirely by analytics_user_sites lookup:
267-SELECT host, total_visits as host_count
268-FROM analytics_user_sites
269-WHERE user_id = $1
270-ORDER BY total_visits DESC
271-```
272-
273-No raw table scan. Current month's new hosts are upserted during the monthly aggregation job.
274-
275----
276-
277-## Implementation Phases
278-
279-### Phase 1: Schema + Backfill (no behavior change)
280-1. Create the four new tables (user_sites, monthly_visits, monthly_top_urls, monthly_top_referers)
281-2. Write the aggregation script (`cmd/scripts/analytics-aggregate/`)
282-3. Run backfill for all historical data
283-4. **Do not delete any data yet** — verify summary tables are correct
284-
285-### Phase 2: Query migration
286-1. Add new DB interface methods that read from summary tables
287-2. Modify existing query methods to union summary + raw data
288-3. Verify TUI and CLI output matches before/after
289-4. Run `make check`
290-
291-### Phase 3: Enable deletion
292-1. Add raw data deletion to the aggregation job
293-2. Schedule the job to run monthly (cron, systemd timer, or app-level scheduler)
294-3. Monitor `analytics_visits` row count after first deletion
295-
296-### Phase 4: Cleanup
297-1. Remove any code paths that no longer need raw historical data
298-2. Consider dropping unused indexes on `analytics_visits` since the table is now small
299-
300----
301-
302-## Trade-offs
303-
304-| Aspect | Before | After |
305-|--------|--------|-------|
306-| `analytics_visits` size | 18M+ rows (unbounded) | ~1 month of data (~150K-500K) |
307-| Query latency | Scans full table each time | Reads small table or pre-aggregated rows |
308-| Historical granularity | Full raw data | Monthly top-10/100 summaries |
309-| Disk overhead | Single table | 4 additional tables (user_sites, monthly_visits, monthly_top_urls, monthly_top_referers) |
310-| Mobile/desktop | Not tracked | Tracked via user-agent classification during aggregation |
311-| Data loss | None | Raw data older than 1 month is gone |
312-
313-The key trade-off is that we lose the ability to run *arbitrary* ad-hoc queries on historical data. But we preserve exactly what the user sees today in the TUI and CLI. If we need new historical queries in the future, we'd add them to the aggregation job.
314-
315----
316-
317-## Open Questions
318-
319-1. **Scheduling**: How do we run the monthly job? Options:
320-   - Cron job on the server
321-   - `systemd` timer
322-   - Built into one of our services as a startup check
323-   - External scheduler (GitHub Actions, etc.)
324-
325-2. **`visitHost()` (site list)**: Solved by `analytics_user_sites` table — simple indexed lookup, no aggregation needed.
326-
327-3. **Retention period**: Is 1 month the right window for raw data? Could adjust based on observed query patterns.
328-
329-4. **Concurrent writes**: During the aggregation window, new visits are still being inserted. The job needs to handle this (e.g., aggregate up to a specific timestamp, then delete up to that same timestamp).
330-
331-5. **Users who disable analytics**: The banner says "when analytics are disabled we do not purge usage statistics." Deletion should respect this — only delete for users with the `analytics` feature flag active.
332-
333-6. **Device detection library**: Which Go user-agent parser to use for mobile/desktop classification? Candidates:
334-   - [`mssola/useragent`](https://github.com/mssola/useragent) — lightweight, device.OS / device.Type
335-   - [`pdericx/ua-parser`](https://github.com/pdericx/ua-parser) — fuller UAParser port
336-   - [`danielgtaylor/mobile`](https://github.com/danielgtaylor/mobile) — simple mobile-only detection
337-   The existing codebase already uses `x-way/crawlerdetect` for bot filtering, so a similar pattern fits.