![]() |
VOOZH | about |
We’re so glad you’re here. You can expect all the best TNS content to arrive Monday through Friday to keep you on top of the news and at the top of your game.
Check your inbox for a confirmation email where you can adjust your preferences and even join additional groups.
Follow TNS on your favorite social media networks.
Become a TNS follower on LinkedIn.
Check out the latest featured and trending stories while you wait for your first TNS newsletter.
-- Detect when current CPU usage deviates significantly from historical patterns.
-- The window functions are computed in a subquery because a WHERE clause
-- cannot reference window-function aliases defined in the same SELECT.
SELECT
  time,
  cpu_usage,
  baseline,
  std_dev
FROM (
  SELECT
    time,
    cpu_usage,
    avg(cpu_usage) OVER (ORDER BY time ROWS BETWEEN 288 PRECEDING AND CURRENT ROW) AS baseline,
    stddev(cpu_usage) OVER (ORDER BY time ROWS BETWEEN 288 PRECEDING AND CURRENT ROW) AS std_dev
  FROM cpu_metrics
) AS windowed
WHERE cpu_usage > baseline + (2 * std_dev)
-- Find relationships between database query time and application response time.
-- Correlation is computed per 5-minute bucket over the last 24 hours.
SELECT
  date_bin(INTERVAL '5 minutes', time, TIMESTAMP '1970-01-01T00:00:00Z') AS bucket,
  CORR(db_query_time, app_response_time) AS correlation
FROM metrics
WHERE time > now() - INTERVAL '24 hours'
GROUP BY 1
-- Calculate 99.9% availability over rolling 30-day periods.
-- A request counts as "available" when response_time < 1000.
WITH daily_stats AS (
  SELECT
    day,
    100.0 * SUM(CASE WHEN response_time < 1000 THEN 1 ELSE 0 END) / COUNT(*) AS daily_avail
  FROM (
    SELECT
      date_bin(INTERVAL '1 day', time, TIMESTAMP '1970-01-01T00:00:00Z') AS day,
      response_time
    FROM service_metrics
    WHERE time >= now() - INTERVAL '60 days'
  ) AS binned
  GROUP BY day
),
-- The rolling average lives in its own CTE so the outer query can filter on
-- it; a WHERE clause cannot reference a window alias from the same SELECT.
rolling AS (
  SELECT
    day,
    AVG(daily_avail) OVER (
      ORDER BY day
      RANGE BETWEEN INTERVAL '29 days' PRECEDING AND CURRENT ROW
    ) AS avail_30d_avg
  FROM daily_stats
)
SELECT day, avail_30d_avg
FROM rolling
WHERE avail_30d_avg < 99.9
ORDER BY day;
def process_writes(influxdb3_local, table_batches, args=None):
    """
    Process incoming metrics data and generate alerts with de-duplication
    to prevent alert storms.

    This plugin:
    1. Monitors incoming metrics for threshold violations
    2. Uses the in-memory cache to track alert states
    3. Implements cooldown periods to prevent alert storms
    4. Writes alert events to an 'alerts' table

    Args:
        influxdb3_local: Runtime handle used here for ``info`` logging,
            ``cache.get`` / ``cache.put`` and ``write``.
        table_batches: Iterable of dicts, each with a ``"table_name"`` and a
            ``"rows"`` entry.
        args: Optional mapping of trigger arguments. Recognised keys are
            ``threshold`` (default ``"90"``), ``cooldown_seconds`` (default
            ``"300"``), ``metric_table`` (default ``"cpu_metrics"``),
            ``metric_field`` (default ``"usage_percent"``) and ``alert_type``
            (default ``"high_value"``). ``None`` means "use all defaults".

    Returns:
        None. Results are emitted through ``influxdb3_local``.
    """
    # The signature allows args=None; treat that as "no overrides" instead of
    # failing on args.get(...) below.
    if args is None:
        args = {}

    # Get configuration from trigger arguments or use defaults
    threshold = float(args.get("threshold", "90"))
    cooldown_seconds = int(args.get("cooldown_seconds", "300"))  # 5 minutes default
    metric_table = args.get("metric_table", "cpu_metrics")
    metric_field = args.get("metric_field", "usage_percent")
    alert_type = args.get("alert_type", "high_value")

    for table_batch in table_batches:
        # Only the configured metric table is of interest
        if table_batch["table_name"] != metric_table:
            continue

        for row in table_batch["rows"]:
            # NOTE(review): assumes each row exposes "tags", "fields" and
            # "timestamp" keys -- confirm against the runtime's row format.
            if "host" not in row["tags"] or metric_field not in row["fields"]:
                continue

            host = row["tags"]["host"]
            value = row["fields"][metric_field]
            timestamp = row["timestamp"]

            # Check if the metric exceeds our threshold
            if value > threshold:
                # Alert state is tracked per host and alert type
                alert_id = f"{host}:{alert_type}"

                # Check if we're in a cooldown period for this alert
                last_alert_time = influxdb3_local.cache.get(alert_id)
                current_time = timestamp / 1_000_000_000  # Convert ns to seconds

                if last_alert_time is None or (current_time - last_alert_time > cooldown_seconds):
                    # We're not in a cooldown period, so generate a new alert
                    influxdb3_local.info(f"{alert_type} alert for {host}: {value} (threshold: {threshold})")

                    # Store the alert time in cache so later rows are suppressed
                    influxdb3_local.cache.put(alert_id, current_time)

                    # Create an alert record.
                    # NOTE(review): LineBuilder is not imported in this listing;
                    # presumably it is provided by the plugin runtime -- confirm.
                    line = LineBuilder("alerts")
                    line.tag("host", host)
                    line.tag("alert_type", alert_type)
                    line.tag("metric_table", metric_table)
                    line.tag("metric_field", metric_field)
                    line.float64_field("threshold", threshold)
                    line.float64_field("value", value)
                    line.string_field("message", f"{metric_field} exceeded threshold: {value}")
                    line.time_ns(timestamp)

                    # Write the alert to the database
                    influxdb3_local.write(line)
                else:
                    # We're in a cooldown period, log this but don't generate a new alert
                    cooldown_remaining = cooldown_seconds - (current_time - last_alert_time)
                    influxdb3_local.info(
                        f"Suppressing duplicate {alert_type} alert for {host}: {value} "
                        f"(cooldown: {int(cooldown_remaining)}s remaining)"
                    )
# Get configuration from trigger arguments or use defaults
threshold = float(args.get("threshold", "90"))
cooldown_seconds = int(args.get("cooldown_seconds", "300")) # 5 minutes default
metric_table = args.get("metric_table", "cpu_metrics")
metric_field = args.get("metric_field", "usage_percent")
alert_type = args.get("alert_type", "high_value")
# Key the alert state by host and alert type
alert_id = f"{host}:{alert_type}"
# Look up when this alert last fired (None if it has not fired yet)
last_alert_time = influxdb3_local.cache.get(alert_id)
current_time = timestamp / 1_000_000_000  # Convert ns to seconds

if last_alert_time is None or (current_time - last_alert_time > cooldown_seconds):
    # Generate alert and update cache
    influxdb3_local.cache.put(alert_id, current_time)
    # ...
else:
    # Suppress duplicate alert
    pass  # ...
# Build the alert record and tag it with the host and alert type
line = LineBuilder("alerts")
line.tag("host", host)
line.tag("alert_type", alert_type)
# ...
# Write the alert to the database
influxdb3_local.write(line)
Deploy the plugin by saving it as alert_deduplication.py and creating triggers for different metrics:
# CPU monitoring trigger
influxdb3 create trigger \
  --trigger-spec "table:system_metrics" \
  --plugin-filename "alert_deduplication.py" \
  --trigger-arguments threshold=95,cooldown_seconds=600,metric_table=system_metrics,metric_field=cpu_usage,alert_type=high_cpu \
  --database monitoring \
  cpu_alert_handler

# Memory monitoring trigger
influxdb3 create trigger \
  --trigger-spec "table:memory_metrics" \
  --plugin-filename "alert_deduplication.py" \
  --trigger-arguments threshold=85,cooldown_seconds=300,metric_table=memory_metrics,metric_field=memory_usage,alert_type=high_memory \
  --database monitoring \
  memory_alert_handler
# Adjust cooldown period based on severity
# NOTE(review): calculate_severity is not defined in this listing; the formula
# below presumably expects a value between 0 and 100 -- confirm.
severity = calculate_severity(value, threshold)
adjusted_cooldown = cooldown_seconds * (1 - severity/100)  # Shorter cooldown for more severe issues
influxdb3_local.cache.put(alert_id, current_time, ttl=adjusted_cooldown)
# Get alert count from cache
alert_count = influxdb3_local.cache.get(f"{alert_id}:count", default=0)
alert_count += 1
influxdb3_local.cache.put(f"{alert_id}:count", alert_count)

# Escalate if this problem has triggered multiple alerts
if alert_count > 3:
    line.tag("priority", "high")
    # NOTE(review): `message` is not defined in this snippet; presumably it is
    # the base alert text built earlier -- confirm before use.
    line.string_field("message", f"ESCALATED: {message} (occurred {alert_count} times)")