Files
notes/PhenixRTS/frontend-draining/19 days draining.md
2025-11-25 21:38:17 -05:00

6.2 KiB

Hostname: frontend-us-northeast-3-vm4w InstanceId: us-northeast#us-east4-c.Iqb8nNAA

DECLARE
  hostName STRING DEFAULT "frontend-us-northeast-3-vm4w";
DECLARE
  lookbackDays INT64 DEFAULT 41;
DECLARE
  start_time TIMESTAMP DEFAULT TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL -lookbackDays DAY);
  
  -----------------------------------------------------------------
  -- Step 1: Find the most recent log message indicating draining connections for a specific host
WITH LatestDrainLog AS ( 
  SELECT Message 
  FROM `phenix-pcast.pcast_logs_us.syslog` 
  WHERE 
	  Timestamp > start_time 
	  AND Facility = 'platform' 
	  AND HostName = hostName 
	  AND Message LIKE 'Websocket connectionids preventing drain%' 
	  ORDER BY Timestamp DESC LIMIT 1 
-- Step 2: Extract all connection IDs from that single log message 
DrainingConnectionIds AS (
	SELECT connectionId 
	FROM 
		LatestDrainLog, 
		UNNEST(REGEXP_EXTRACT_ALL(Message, r"'([^']*)'")) AS connectionId )
),
-- Step 3: Find all logs that associate sessions with connections
SessionConnections AS (
	Timestamp
	SELECT 
		REGEXP_EXTRACT(Message, r'\] \[(.*?)\] Session started with connection') AS sessionId, 
		REGEXP_EXTRACT(Message, r'connection \[(.*?)\] and roles') AS connectionId, 
	FROM `phenix-pcast.pcast_logs_us.syslog` 
	WHERE Timestamp > start_time 
		AND Facility = 'platform' 
		AND Message LIKE '%Session started with connection%' 
		AND REGEXP_EXTRACT(Message, r'\] \[(.*?)\] Session started with connection') IS NOT NULL 
		AND REGEXP_EXTRACT(Message, r'connection \[(.*?)\] and roles') IS NOT NULL
	),
	-- Pattern 2: "Session [sessionId] has a new connection [connectionId], previously [oldConnectionId]" 
SessionNewConnections AS ( 
SELECT 
	Timestamp
	REGEXP_EXTRACT(Message, r'Session \[(.*?)\] has a new connection') AS sessionId,
	REGEXP_EXTRACT(Message, r'connection \[(.*?)\], previously') AS connectionId 
FROM `phenix-pcast.pcast_logs_us.syslog`
WHERE 
	Timestamp > start_time 
	AND Facility = 'platform' 
	AND Message LIKE '%Session%has a new connection%'
	AND REGEXP_EXTRACT(Message, r'Session \[(.*?)\] has a new connection') IS NOT NULL 
	AND REGEXP_EXTRACT(Message, r'connection \[(.*?)\], previously') IS NOT NULL 
AllSessionConnections AS ( 
	SELECT Timestamp, sessionId, connectionId 
	FROM SessionConnections 
		UNION DISTINCT 
	SELECT Timestamp, sessionId, connectionId 
	FROM SessionNewConnections )
)
SELECT *
FROM AllSessionConnections
ORDER BY Timestmap
	

DECLARE TargetHostName STRING DEFAULT "frontend-us-northeast-3-vm4w";
DECLARE TargetInstanceId STRING DEFAULT "us-northeast#us-east4-c.Iqb8nNAA";
DECLARE TargetConnectionId STRING DEFAULT "us-central#ZQiUCdymrZHbmeF12NhUZQ8xZZXxviWD";
------------
With ConnectionIdsPreventingDrain AS (
SELECT
  Message
FROM
  `phenix-pcast.pcast_logs_us.syslog`
WHERE
  Timestamp > TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL -2 MINUTE)
  AND Facility = 'platform'
  AND HostName = hostName
  AND Message LIKE 'Websocket connectionids preventing drain%'
ORDER BY
  Timestamp DESC
LIMIT
  1
  )
DECLARE HostName STRING DEFAULT "frontend-us-northeast-3-vm4w";

CREATE TEMPORARY FUNCTION GET_METRIC_VALUE(statusJson STRING, metricName STRING) RETURNS FLOAT64 AS (
  COALESCE(
    (
      SELECT
        CAST(
          JSON_EXTRACT_SCALAR(metric, '$.value') AS FLOAT64
        )
      FROM
        UNNEST(
          JSON_EXTRACT_ARRAY(JSON_EXTRACT(statusJson, '$.load'))
        ) AS metric
      WHERE
        JSON_EXTRACT_SCALAR(metric, '$.name') = metricName
      LIMIT
        1
    ),
    0
  )
);
-- WITH LatestInstanceMetricForHost AS (
  SELECT 
    Timestamp, 
    Status,
    (GET_METRIC_VALUE(Status, 'uptime/os/seconds') / 3600 ) AS UptimeHours,
    (GET_METRIC_VALUE(Status, 'status/seconds') / 3600 ) AS DrainingHours    
  FROM `phenix-pcast.pcast.InstanceMetrics`
  WHERE Timestamp > TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL -1 MINUTE)
  AND Hostname = Hostname
  QUALIFY ROW_NUMBER() OVER (PARTITION BY InstanceId ORDER BY Timestamp DESC) = 1
  ORDER BY Timestamp DESC
  LIMIT 1

DECLARE TargetInstanceId STRING DEFAULT "us-northeast#us-east4-c.Iqb8nNAA";

CREATE TEMPORARY FUNCTION GET_METRIC_VALUE(statusJson STRING, metricName STRING) RETURNS FLOAT64 AS (
  COALESCE(
    (
      SELECT
        CAST(
          JSON_EXTRACT_SCALAR(metric, '$.value') AS FLOAT64
        )
      FROM
        UNNEST(
          JSON_EXTRACT_ARRAY(JSON_EXTRACT(statusJson, '$.load'))
        ) AS metric
      WHERE
        JSON_EXTRACT_SCALAR(metric, '$.name') = metricName
      LIMIT
        1
    ),
    0


  )
);

SELECT 
  Timestamp, 
  Status,
  InstanceId,
  HostName,
  Health,
  HealthAlert,
  FORMAT('%.2f', (GET_METRIC_VALUE(Status, 'uptime/os/seconds') / 3600 )) AS UptimeHours,
  FORMAT('%.2f', (GET_METRIC_VALUE(Status, 'status/seconds') / 3600 )) AS DrainingHours,    
  GET_METRIC_VALUE(Status, 'connections/open') AS connectionsOpen,
  GET_METRIC_VALUE(Status, 'clients') AS clients,
  GET_METRIC_VALUE(Status, 'clients/subscriptions') AS clientsSubscriptions,
  GET_METRIC_VALUE(Status, 'clients/replay/events') AS clientsReplayEvents,
  GET_METRIC_VALUE(Status, 'mq/incoming/pending') AS mqIncomingPending,
  GET_METRIC_VALUE(Status, 'mq/outgoing/pending') AS mqOutgoingPending,
  GET_METRIC_VALUE(Status, 'mq/incoming/rate') AS mqIncomingRate,

FROM `phenix-pcast.pcast.InstanceMetrics`
WHERE Timestamp > TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL -1 MINUTE)
AND InstanceId = TargetInstanceId
QUALIFY ROW_NUMBER() OVER (PARTITION BY InstanceId ORDER BY Timestamp DESC) = 1
ORDER BY Timestamp DESC
LIMIT 1

Using

SELECT 
  Timestamp,
  Category,
  Severity,
  Message,
  HostName,
  Region,
  Zone,
FROM 
  `phenix-pcast.pcast_logs_us.syslog`
WHERE 
    Timestamp > TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL -90 DAY)

    AND Facility = 'platform'
    AND Service = 'frontend'
    AND Message LIKE "%Drain instance%"
    AND HostName = 'frontend-us-northeast-3-vm4w'
ORDER BY 
  Timestamp

HostName: frontend-us-northeast-3-vm4w

Went into draining 2025-11-03 19:49:37.012998 UTC - [us-northeast#us-east4-c.Iqb8nNAA] Drain instance (undoable=[false])

Skipping ping as previous ping is still pending since [1760474289916] 1760474289916 --> 2025-10-14T20:38:09.916Z