Antonio Mika
·
2025-03-12
analytics.go
1package shared
2
3import (
4 "context"
5 "crypto/hmac"
6 "crypto/sha256"
7 "encoding/hex"
8 "encoding/json"
9 "errors"
10 "fmt"
11 "log/slog"
12 "net"
13 "net/http"
14 "net/url"
15 "strings"
16 "time"
17
18 "github.com/picosh/pico/pkg/db"
19 "github.com/picosh/utils/pipe/metrics"
20 "github.com/simplesurance/go-ip-anonymizer/ipanonymizer"
21 "github.com/x-way/crawlerdetect"
22)
23
24var internalCrawlers *crawlerdetect.CrawlerDetect
25
26func init() {
27 internalCrawlers = crawlerdetect.New()
28 internalCrawlers.SetCrawlers([]string{
29 `^Azure Traffic Manager Endpoint Monitor$`,
30 `^Blackbox Exporter\/`,
31 `^Prometheus\/`,
32 })
33}
34
// HmacString returns the hex-encoded HMAC-SHA256 of data, keyed with secret.
func HmacString(secret, data string) string {
	mac := hmac.New(sha256.New, []byte(secret))
	// hash.Hash documents that Write never returns an error
	mac.Write([]byte(data))

	digest := make([]byte, 0, sha256.Size)
	digest = mac.Sum(digest)
	return hex.EncodeToString(digest)
}
41
42func trackableUserAgent(agent string) error {
43 // dont store requests from bots
44 if crawlerdetect.IsCrawler(agent) || internalCrawlers.IsCrawler(agent) {
45 return fmt.Errorf(
46 "request is likely from a bot (User-Agent: %s)",
47 CleanUserAgent(agent),
48 )
49 }
50 return nil
51}
52
53func trackableRequest(r *http.Request) error {
54 agent := r.UserAgent()
55 return trackableUserAgent(agent)
56}
57
58func cleanIpAddress(ip string) (string, error) {
59 host, _, err := net.SplitHostPort(ip)
60 if err != nil {
61 host = ip
62 }
63 // /24 IPv4 subnet mask
64 // /64 IPv6 subnet mask
65 anonymizer := ipanonymizer.NewWithMask(
66 net.CIDRMask(24, 32),
67 net.CIDRMask(64, 128),
68 )
69 anonIp, err := anonymizer.IPString(host)
70 return anonIp, err
71}
72
// cleanUrl parses orig and returns its host and path components, discarding
// everything else (query string, fragment, ...). Both results are empty when
// orig cannot be parsed.
func cleanUrl(orig string) (string, string) {
	if parsed, err := url.Parse(orig); err == nil {
		return parsed.Host, parsed.Path
	}
	return "", ""
}
80
// cleanUrlFromRequest returns the host and path of a request. The host is
// taken from the first non-empty value among the x-forwarded-host header,
// r.URL.Host and r.Host.
func cleanUrlFromRequest(r *http.Request) (string, string) {
	host := r.Header.Get("x-forwarded-host")
	switch {
	case host != "":
		// forwarded host wins
	case r.URL.Host != "":
		host = r.URL.Host
	default:
		host = r.Host
	}
	// we don't want query params in the url for security reasons
	return host, r.URL.Path
}
92
// CleanUserAgent normalizes a User-Agent value for storage. Surrounding
// whitespace is always removed and the result is capped at 1000 bytes. When
// truncation is needed the cut is moved back to the nearest rune boundary so
// that a multi-byte UTF-8 character is never split in half.
func CleanUserAgent(ua string) string {
	// truncate user-agent because http headers have no text limit
	const maxLen = 1000

	ua = strings.TrimSpace(ua)
	if len(ua) <= maxLen {
		return ua
	}

	// ua[cut] is the first byte that gets dropped; while it is a UTF-8
	// continuation byte (10xxxxxx) the cut would land inside a rune.
	cut := maxLen
	for cut > 0 && ua[cut]&0xC0 == 0x80 {
		cut--
	}
	// truncation may have exposed trailing whitespace
	return strings.TrimSpace(ua[:cut])
}
100
// filterIp rejects hosts that are literal IP addresses. An empty host is
// passed through as ("", nil); a host that parses as an IP yields ("", error);
// anything else is returned unchanged.
func filterIp(host string) (string, error) {
	switch {
	case host == "":
		return "", nil
	case net.ParseIP(host) != nil:
		return "", fmt.Errorf("host is an ip")
	default:
		return host, nil
	}
}
111
112func CleanReferer(raw string) (string, error) {
113 ref := raw
114 if ref == "" {
115 return "", nil
116 }
117 // referer sometimes dont include scheme but we need it
118 if !strings.HasPrefix(ref, "http") {
119 ref = "https://" + ref
120 }
121 // we only want to store host for security reasons
122 // https://developer.mozilla.org/en-US/docs/Web/Security/Referer_header:_privacy_and_security_concerns
123 u, err := url.Parse(ref)
124 if err != nil {
125 return "", err
126 }
127 hostname := u.Hostname()
128 hostname, _ = filterIp(hostname)
129 hostname = strings.TrimSpace(strings.ToLower(hostname))
130 return hostname, err
131}
132
133func CleanHost(raw string) (string, error) {
134 prep := strings.TrimSpace(strings.ToLower(raw))
135 if prep == "" {
136 return "", fmt.Errorf("host is blank")
137 }
138 // hosts dont usually include scheme but we need it
139 if !strings.HasPrefix(prep, "http") {
140 prep = "https://" + prep
141 }
142 // no clue why but our prod data contains periods
143 prep = strings.Trim(prep, ".")
144 // we only want to store host for security reasons
145 // https://developer.mozilla.org/en-US/docs/Web/Security/Referer_header:_privacy_and_security_concerns
146 u, err := url.Parse(prep)
147 if err != nil {
148 return raw, err
149 }
150 host := u.Hostname()
151 host, err = filterIp(host)
152 return host, err
153}
154
// ErrAnalyticsDisabled is returned by AnalyticsVisitFromVisit and
// AnalyticsVisitFromRequest when the "analytics" feature is not enabled for
// the user the visit belongs to.
var ErrAnalyticsDisabled = errors.New("owner does not have site analytics enabled")
156
157func AnalyticsVisitFromVisit(visit *db.AnalyticsVisits, dbpool db.DB, secret string) error {
158 if !dbpool.HasFeatureForUser(visit.UserID, "analytics") {
159 return ErrAnalyticsDisabled
160 }
161
162 err := trackableUserAgent(visit.UserAgent)
163 if err != nil {
164 return err
165 }
166
167 ipAddress, err := cleanIpAddress(visit.IpAddress)
168 if err != nil {
169 return err
170 }
171 visit.IpAddress = HmacString(secret, ipAddress)
172 _, path := cleanUrl(visit.Path)
173 visit.Path = path
174
175 referer, err := CleanReferer(visit.Referer)
176 if err != nil {
177 return err
178 }
179 visit.Referer = referer
180
181 hostname, err := CleanHost(visit.Host)
182 if err != nil {
183 return err
184 }
185 visit.Host = hostname
186 visit.UserAgent = CleanUserAgent(visit.UserAgent)
187
188 return nil
189}
190
191func ipFromRequest(r *http.Request) string {
192 // https://caddyserver.com/docs/caddyfile/directives/reverse_proxy#defaults
193 ipOrig := r.Header.Get("x-forwarded-for")
194 if ipOrig == "" {
195 ipOrig = r.RemoteAddr
196 }
197 // probably means this is a web tunnel
198 if ipOrig == "" || ipOrig == "@" {
199 sshCtx, err := GetSshCtx(r)
200 if err == nil {
201 ipOrig = sshCtx.RemoteAddr().String()
202 }
203 }
204
205 return ipOrig
206}
207
208func AnalyticsVisitFromRequest(r *http.Request, dbpool db.DB, userID string) (*db.AnalyticsVisits, error) {
209 if !dbpool.HasFeatureForUser(userID, "analytics") {
210 return nil, ErrAnalyticsDisabled
211 }
212
213 err := trackableRequest(r)
214 if err != nil {
215 return nil, err
216 }
217
218 ipAddress := ipFromRequest(r)
219 host, path := cleanUrlFromRequest(r)
220
221 return &db.AnalyticsVisits{
222 UserID: userID,
223 Host: host,
224 Path: path,
225 IpAddress: ipAddress,
226 UserAgent: r.UserAgent(),
227 Referer: r.Referer(),
228 Status: http.StatusOK,
229 }, nil
230}
231
232func AnalyticsCollect(ch chan *db.AnalyticsVisits, dbpool db.DB, logger *slog.Logger) {
233 drain := metrics.RegisterReconnectMetricRecorder(
234 context.Background(),
235 logger,
236 NewPicoPipeClient(),
237 100,
238 10*time.Millisecond,
239 )
240
241 for visit := range ch {
242 data, err := json.Marshal(visit)
243 if err != nil {
244 logger.Error("could not json marshall visit record", "err", err)
245 continue
246 }
247
248 data = append(data, '\n')
249
250 _, err = drain.Write(data)
251 if err != nil {
252 logger.Error("could not write to metric-drain", "err", err)
253 }
254 }
255}