Closes #01

2026-05-18 15:31:52 -04:00
parent 269d560847
commit 41be5a2e24
3 changed files with 384 additions and 239 deletions
--- a/analysis/03-Two_Sum_Is_Not_About_Numbers/examples/two_sum_logs_demo.cpp
+++ b/analysis/03-Two_Sum_Is_Not_About_Numbers/examples/two_sum_logs_demo.cpp
@@ -0,0 +1,223 @@
+#include <algorithm>
+#include <cstdint>
+#include <deque>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+struct Event {
+    std::int64_t timestamp_ms;
+    std::string service;
+    std::string event;
+    int latency_ms;
+    std::string request_id;
+    std::string raw_line;
+};
+
+struct PairResult {
+    bool found = false;
+    Event first;
+    Event second;
+};
+
+std::int64_t parseTimestampMs(const std::string& timestamp)
+{
+    // Expected format:
+    // YYYY-MM-DDTHH:MM:SS.mmmZ
+    // For this demo we only convert the HH:MM:SS.mmm part to milliseconds.
+    const std::size_t t_pos = timestamp.find('T');
+    const std::size_t z_pos = timestamp.find('Z');
+
+    if (t_pos == std::string::npos || z_pos == std::string::npos) {
+        throw std::runtime_error("Invalid timestamp: " + timestamp);
+    }
+
+    const std::string time_part = timestamp.substr(t_pos + 1, z_pos - t_pos - 1);
+
+    int hours = 0;
+    int minutes = 0;
+    int seconds = 0;
+    int millis = 0;
+    char colon1 = '\0';
+    char colon2 = '\0';
+    char dot = '\0';
+
+    std::istringstream iss(time_part);
+    iss >> hours >> colon1 >> minutes >> colon2 >> seconds >> dot >> millis;
+
+    if (!iss || colon1 != ':' || colon2 != ':' || dot != '.') {
+        throw std::runtime_error("Invalid time part: " + time_part);
+    }
+
+    return (((hours * 60LL) + minutes) * 60LL + seconds) * 1000LL + millis;
+}
+
+Event parseLogLine(const std::string& line)
+{
+    std::istringstream iss(line);
+
+    std::string timestamp;
+    std::string service_token;
+    std::string event_token;
+    std::string latency_token;
+    std::string request_token;
+
+    if (!(iss >> timestamp >> service_token >> event_token >> latency_token >> request_token)) {
+        throw std::runtime_error("Cannot parse log line: " + line);
+    }
+
+    auto valueAfterEquals = [](const std::string& token) -> std::string {
+        const std::size_t pos = token.find('=');
+        if (pos == std::string::npos || pos + 1 >= token.size()) {
+            throw std::runtime_error("Invalid token: " + token);
+        }
+        return token.substr(pos + 1);
+    };
+
+    Event result;
+    result.timestamp_ms = parseTimestampMs(timestamp);
+    result.service = valueAfterEquals(service_token);
+    result.event = valueAfterEquals(event_token);
+
+    std::string latency_value = valueAfterEquals(latency_token);
+    if (latency_value.size() < 3 || latency_value.substr(latency_value.size() - 2) != "ms") {
+        throw std::runtime_error("Invalid latency token: " + latency_token);
+    }
+    latency_value.erase(latency_value.size() - 2);
+    result.latency_ms = std::stoi(latency_value);
+
+    result.request_id = valueAfterEquals(request_token);
+    result.raw_line = line;
+
+    return result;
+}
+
+std::vector<Event> parseLogs(const std::vector<std::string>& lines)
+{
+    std::vector<Event> events;
+    events.reserve(lines.size());
+
+    for (const std::string& line : lines) {
+        events.push_back(parseLogLine(line));
+    }
+
+    return events;
+}
+
+void printPair(const PairResult& result, const std::string& label)
+{
+    std::cout << label << '\n';
+
+    if (!result.found) {
+        std::cout << "  no pair found\n\n";
+        return;
+    }
+
+    std::cout << "  first : " << result.first.raw_line << '\n';
+    std::cout << "  second: " << result.second.raw_line << '\n';
+    std::cout << "  combined latency: "
+              << (result.first.latency_ms + result.second.latency_ms)
+              << "ms\n\n";
+}
+
+PairResult interviewStyleReduction(const std::vector<Event>& events, int threshold_ms)
+{
+    // Intentionally wrong for the real-world problem:
+    // it ignores request_id and time.
+    for (std::size_t i = 0; i < events.size(); ++i) {
+        for (std::size_t j = i + 1; j < events.size(); ++j) {
+            if (events[i].latency_ms + events[j].latency_ms > threshold_ms) {
+                return PairResult{true, events[i], events[j]};
+            }
+        }
+    }
+
+    return PairResult{};
+}
+
+PairResult realSlidingWindowDetection(const std::vector<Event>& events,
+                                      int threshold_ms,
+                                      std::int64_t window_ms)
+{
+    std::unordered_map<std::string, std::deque<Event> > active_events;
+
+    for (const Event& current : events) {
+        std::deque<Event>& bucket = active_events[current.request_id];
+
+        while (!bucket.empty() &&
+               (current.timestamp_ms - bucket.front().timestamp_ms) > window_ms) {
+            bucket.pop_front();
+        }
+
+        for (const Event& previous : bucket) {
+            const std::int64_t delta = current.timestamp_ms - previous.timestamp_ms;
+
+            if (delta >= 0 && delta <= window_ms &&
+                previous.latency_ms + current.latency_ms > threshold_ms) {
+                return PairResult{true, previous, current};
+            }
+        }
+
+        bucket.push_back(current);
+    }
+
+    return PairResult{};
+}
+
+void printEvents(const std::vector<Event>& events)
+{
+    std::cout << "Synthetic log stream:\n";
+    for (const Event& event : events) {
+        std::cout << "  " << event.raw_line << '\n';
+    }
+    std::cout << '\n';
+}
+
+int main()
+{
+    try {
+        const std::vector<std::string> raw_logs = {
+            "2026-04-16T10:15:01.100Z service=api    event=parse_input   latency=12ms request_id=req-1001",
+            "2026-04-16T10:15:01.110Z service=cache  event=cache_miss    latency=48ms request_id=req-1001",
+            "2026-04-16T10:15:01.120Z service=auth   event=token_check   latency=58ms request_id=req-2001",
+            "2026-04-16T10:15:01.130Z service=db     event=read_user     latency=43ms request_id=req-3001",
+            "2026-04-16T10:15:01.135Z service=db     event=read_user     latency=55ms request_id=req-1001",
+            "2026-04-16T10:15:01.144Z service=net    event=external_call latency=47ms request_id=req-1001",
+            "2026-04-16T10:15:01.200Z service=cache  event=cache_miss    latency=60ms request_id=req-3001",
+            "2026-04-16T10:15:01.260Z service=net    event=external_call latency=52ms request_id=req-3001"
+        };
+
+        const int threshold_ms = 100;
+        const std::int64_t window_ms = 20;
+
+        const std::vector<Event> events = parseLogs(raw_logs);
+
+        printEvents(events);
+
+        std::cout << "Threshold: " << threshold_ms << "ms\n";
+        std::cout << "Time window: " << window_ms << "ms\n\n";
+
+        const PairResult naive_result = interviewStyleReduction(events, threshold_ms);
+        printPair(naive_result, "Interview-style reduction (ignores request_id and time):");
+
+        const PairResult real_result = realSlidingWindowDetection(events, threshold_ms, window_ms);
+        printPair(real_result, "Streaming sliding-window detection:");
+
+        std::cout << "Notes:\n";
+        std::cout << "  - The interview-style version can produce a false correlation.\n";
+        std::cout << "  - In this dataset, it first matches 58ms from req-2001 with 43ms from req-3001.\n";
+        std::cout << "  - That pair exceeds the threshold, but it is operationally meaningless.\n";
+        std::cout << "  - The streaming version only correlates events from the same request_id\n";
+        std::cout << "    and only within the configured time window.\n";
+
+        return 0;
+    }
+    catch (const std::exception& ex) {
+        std::cerr << "Error: " << ex.what() << '\n';
+        return 1;
+    }
+}