Files
notes/PhenixRTS/frontend-draining/19 days draining.md
2025-11-25 21:38:17 -05:00

228 lines
6.2 KiB
Markdown

Hostname: `frontend-us-northeast-3-vm4w`
InstanceId: `us-northeast#us-east4-c.Iqb8nNAA`
```SQL
-- Purpose: list the session <-> connection associations logged by the platform, alongside
-- the websocket connection ids reported as preventing drain on one frontend host.
--
-- Script variables deliberately avoid column names: when a variable and a column share a
-- name (case-insensitively), the column wins, so `HostName = hostName` would compare the
-- column with itself and silently match every host.
DECLARE targetHostName STRING DEFAULT "frontend-us-northeast-3-vm4w";
DECLARE lookbackDays INT64 DEFAULT 41;
DECLARE start_time TIMESTAMP DEFAULT TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL lookbackDays DAY);
-----------------------------------------------------------------
WITH
-- Step 1: Find the most recent log message indicating draining connections for the target host
LatestDrainLog AS (
    SELECT Message
    FROM `phenix-pcast.pcast_logs_us.syslog`
    WHERE Timestamp > start_time
        AND Facility = 'platform'
        AND HostName = targetHostName
        AND Message LIKE 'Websocket connectionids preventing drain%'
    ORDER BY Timestamp DESC
    LIMIT 1
),
-- Step 2: Extract all connection IDs (every single-quoted token) from that single log message
-- NOTE(review): this CTE is not referenced by the final SELECT yet; presumably the result is
-- meant to be restricted to these connection ids - confirm before adding the join.
DrainingConnectionIds AS (
    SELECT connectionId
    FROM LatestDrainLog
    CROSS JOIN UNNEST(REGEXP_EXTRACT_ALL(Message, r"'([^']*)'")) AS connectionId
),
-- Step 3: Find all logs that associate sessions with connections
-- Pattern 1: "[sessionId] Session started with connection [connectionId] and roles"
SessionConnections AS (
    SELECT
        Timestamp,
        REGEXP_EXTRACT(Message, r'\] \[(.*?)\] Session started with connection') AS sessionId,
        REGEXP_EXTRACT(Message, r'connection \[(.*?)\] and roles') AS connectionId
    FROM `phenix-pcast.pcast_logs_us.syslog`
    WHERE Timestamp > start_time
        AND Facility = 'platform'
        AND Message LIKE '%Session started with connection%'
        -- Keep only rows where both ids could actually be parsed out of the message
        AND REGEXP_EXTRACT(Message, r'\] \[(.*?)\] Session started with connection') IS NOT NULL
        AND REGEXP_EXTRACT(Message, r'connection \[(.*?)\] and roles') IS NOT NULL
),
-- Pattern 2: "Session [sessionId] has a new connection [connectionId], previously [oldConnectionId]"
SessionNewConnections AS (
    SELECT
        Timestamp,
        REGEXP_EXTRACT(Message, r'Session \[(.*?)\] has a new connection') AS sessionId,
        REGEXP_EXTRACT(Message, r'connection \[(.*?)\], previously') AS connectionId
    FROM `phenix-pcast.pcast_logs_us.syslog`
    WHERE Timestamp > start_time
        AND Facility = 'platform'
        AND Message LIKE '%Session%has a new connection%'
        -- Keep only rows where both ids could actually be parsed out of the message
        AND REGEXP_EXTRACT(Message, r'Session \[(.*?)\] has a new connection') IS NOT NULL
        AND REGEXP_EXTRACT(Message, r'connection \[(.*?)\], previously') IS NOT NULL
),
-- Both patterns combined; UNION DISTINCT drops exact duplicate rows
AllSessionConnections AS (
    SELECT Timestamp, sessionId, connectionId
    FROM SessionConnections
    UNION DISTINCT
    SELECT Timestamp, sessionId, connectionId
    FROM SessionNewConnections
)
SELECT
    Timestamp,
    sessionId,
    connectionId
FROM AllSessionConnections
ORDER BY Timestamp
```
------
```SQL
-- Purpose: fetch the newest "Websocket connectionids preventing drain" log line that the
-- target host emitted within the last 2 minutes.
DECLARE TargetHostName STRING DEFAULT "frontend-us-northeast-3-vm4w";
-- TargetInstanceId and TargetConnectionId are declared for this investigation but are not
-- referenced by the query below.
DECLARE TargetInstanceId STRING DEFAULT "us-northeast#us-east4-c.Iqb8nNAA";
DECLARE TargetConnectionId STRING DEFAULT "us-central#ZQiUCdymrZHbmeF12NhUZQ8xZZXxviWD";
------------
WITH ConnectionIdsPreventingDrain AS (
    SELECT
        Message
    FROM
        `phenix-pcast.pcast_logs_us.syslog`
    WHERE
        Timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2 MINUTE)
        AND Facility = 'platform'
        -- Compare against the declared variable; an undeclared `hostName` resolves to the
        -- HostName column itself and would match every host.
        AND HostName = TargetHostName
        AND Message LIKE 'Websocket connectionids preventing drain%'
    ORDER BY
        Timestamp DESC
    LIMIT
        1
)
-- A WITH clause must be followed by a query; return the single matching log line.
SELECT
    Message
FROM
    ConnectionIdsPreventingDrain
```
```SQL
-- Purpose: show the newest InstanceMetrics row (last minute) for one host, with two metrics
-- from the Status JSON converted by dividing by 3600.
--
-- The variable must not be called HostName: a column of the same name takes precedence, so
-- `Hostname = Hostname` compared the column with itself and the host filter was a no-op.
DECLARE TargetHostName STRING DEFAULT "frontend-us-northeast-3-vm4w";

-- Returns the `value` of the first element of statusJson's `$.load` array whose `name`
-- equals metricName, cast to FLOAT64; returns 0 when no such element (or value) is found.
CREATE TEMPORARY FUNCTION GET_METRIC_VALUE(statusJson STRING, metricName STRING) RETURNS FLOAT64 AS (
    COALESCE(
        (
            SELECT
                CAST(JSON_EXTRACT_SCALAR(metric, '$.value') AS FLOAT64)
            FROM
                UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT(statusJson, '$.load'))) AS metric
            WHERE
                JSON_EXTRACT_SCALAR(metric, '$.name') = metricName
            LIMIT
                1
        ),
        0
    )
);

SELECT
    Timestamp,
    Status,
    (GET_METRIC_VALUE(Status, 'uptime/os/seconds') / 3600) AS UptimeHours,
    (GET_METRIC_VALUE(Status, 'status/seconds') / 3600) AS DrainingHours
FROM `phenix-pcast.pcast.InstanceMetrics`
WHERE Timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 MINUTE)
    AND Hostname = TargetHostName
-- Newest row per instance first, then the newest of those overall
QUALIFY ROW_NUMBER() OVER (PARTITION BY InstanceId ORDER BY Timestamp DESC) = 1
ORDER BY Timestamp DESC
LIMIT 1
```
```SQL
-- Newest InstanceMetrics row (last minute) for a single instance id, with selected entries
-- of the Status JSON `load` array flattened into columns.
DECLARE TargetInstanceId STRING DEFAULT "us-northeast#us-east4-c.Iqb8nNAA";

-- Looks up the `load` array element named metricName inside statusJson and returns its
-- `value` as FLOAT64; yields 0 when nothing matches.
CREATE TEMPORARY FUNCTION GET_METRIC_VALUE(statusJson STRING, metricName STRING)
RETURNS FLOAT64
AS (
    IFNULL(
        (
            SELECT CAST(JSON_EXTRACT_SCALAR(load_entry, '$.value') AS FLOAT64)
            FROM UNNEST(JSON_EXTRACT_ARRAY(statusJson, '$.load')) AS load_entry
            WHERE JSON_EXTRACT_SCALAR(load_entry, '$.name') = metricName
            LIMIT 1
        ),
        0
    )
);

SELECT
    Timestamp,
    Status,
    InstanceId,
    HostName,
    Health,
    HealthAlert,
    -- Both values below are divided by 3600 and rendered with two decimals
    FORMAT('%.2f', GET_METRIC_VALUE(Status, 'uptime/os/seconds') / 3600) AS UptimeHours,
    FORMAT('%.2f', GET_METRIC_VALUE(Status, 'status/seconds') / 3600) AS DrainingHours,
    GET_METRIC_VALUE(Status, 'connections/open') AS connectionsOpen,
    GET_METRIC_VALUE(Status, 'clients') AS clients,
    GET_METRIC_VALUE(Status, 'clients/subscriptions') AS clientsSubscriptions,
    GET_METRIC_VALUE(Status, 'clients/replay/events') AS clientsReplayEvents,
    GET_METRIC_VALUE(Status, 'mq/incoming/pending') AS mqIncomingPending,
    GET_METRIC_VALUE(Status, 'mq/outgoing/pending') AS mqOutgoingPending,
    GET_METRIC_VALUE(Status, 'mq/incoming/rate') AS mqIncomingRate
FROM
    `phenix-pcast.pcast.InstanceMetrics`
WHERE
    InstanceId = TargetInstanceId
    AND Timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 MINUTE)
QUALIFY
    ROW_NUMBER() OVER (PARTITION BY InstanceId ORDER BY Timestamp DESC) = 1
ORDER BY
    Timestamp DESC
LIMIT 1
```
Using the following query to find when the instance was put into draining:
```SQL
-- All frontend platform log lines from the last 90 days that mention "Drain instance"
-- for host frontend-us-northeast-3-vm4w, oldest first.
SELECT
    Timestamp, Category, Severity, Message, HostName, Region, Zone
FROM `phenix-pcast.pcast_logs_us.syslog`
WHERE HostName = 'frontend-us-northeast-3-vm4w'
    AND Service = 'frontend'
    AND Facility = 'platform'
    AND Message LIKE "%Drain instance%"
    AND Timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 90 DAY)
ORDER BY Timestamp ASC
```
`HostName`: `frontend-us-northeast-3-vm4w`
The instance went into draining at the time of this log line:
`2025-11-03 19:49:37.012998 UTC` - `[us-northeast#us-east4-c.Iqb8nNAA] Drain instance (undoable=[false])`
`Skipping ping as previous ping is still pending since [1760474289916]`
1760474289916 --> `2025-10-14T20:38:09.916Z`