Observability Setup Skill

Safety Notice

This listing is imported from the skills.sh public index metadata. Review the upstream SKILL.md and repository scripts before running.

Install the "observability-setup" skill with this command: npx skills add navikt/copilot/navikt-copilot-observability-setup

This skill provides patterns for setting up observability in Nais applications.

Required Health Endpoints

import com.zaxxer.hikari.HikariDataSource
import io.ktor.http.*
import io.ktor.server.application.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import org.apache.kafka.clients.producer.KafkaProducer

fun Application.configureHealthEndpoints(
    dataSource: HikariDataSource,
    kafkaProducer: KafkaProducer<String, String>
) {
    routing {
        get("/isalive") {
            call.respondText("Alive", ContentType.Text.Plain)
        }

        get("/isready") {
            val databaseHealthy = checkDatabase(dataSource)
            val kafkaHealthy = checkKafka(kafkaProducer)

            if (databaseHealthy && kafkaHealthy) {
                call.respondText("Ready", ContentType.Text.Plain)
            } else {
                call.respondText(
                    "Not ready",
                    ContentType.Text.Plain,
                    HttpStatusCode.ServiceUnavailable
                )
            }
        }
    }
}

fun checkDatabase(dataSource: HikariDataSource): Boolean {
    return try {
        dataSource.connection.use { it.isValid(1) }
    } catch (e: Exception) {
        false
    }
}

fun checkKafka(producer: KafkaProducer<String, String>): Boolean {
    return try {
        producer.partitionsFor("health-check-topic").isNotEmpty()
    } catch (e: Exception) {
        false
    }
}

Prometheus Metrics Setup

import io.ktor.http.*
import io.ktor.server.application.*
import io.ktor.server.metrics.micrometer.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import io.micrometer.core.instrument.Clock
import io.micrometer.core.instrument.binder.jvm.*
import io.micrometer.core.instrument.binder.system.*
import io.micrometer.prometheus.PrometheusConfig
import io.micrometer.prometheus.PrometheusMeterRegistry
import io.prometheus.client.CollectorRegistry

val meterRegistry = PrometheusMeterRegistry(
    PrometheusConfig.DEFAULT,
    CollectorRegistry.defaultRegistry,
    Clock.SYSTEM
)

fun Application.configureMetrics() {
    install(MicrometerMetrics) {
        registry = meterRegistry
        // Production pattern from navikt/ao-oppfolgingskontor
        meterBinders = listOf(
            JvmMemoryMetrics(),  // Heap, non-heap memory
            JvmGcMetrics(),      // Garbage collection
            ProcessorMetrics(),  // CPU usage
            UptimeMetrics()      // Application uptime
        )
    }

    routing {
        get("/metrics") {
            call.respondText(
                meterRegistry.scrape(),
                ContentType.parse("text/plain; version=0.0.4")
            )
        }
    }
}

Business Metrics

import io.micrometer.core.instrument.Counter
import io.micrometer.core.instrument.Timer
import io.micrometer.prometheus.PrometheusMeterRegistry

class UserService(
    private val repository: UserRepository, // referenced by createUser(); type assumed
    private val meterRegistry: PrometheusMeterRegistry
) {
    private val userCreatedCounter = Counter.builder("users_created_total")
        .description("Total users created")
        .register(meterRegistry)

    private val userCreationTimer = Timer.builder("user_creation_duration_seconds")
        .description("User creation duration")
        .register(meterRegistry)

    fun createUser(user: User) {
        userCreationTimer.record {
            repository.save(user)
        }
        userCreatedCounter.increment()
    }
}

OpenTelemetry Tracing

Nais enables OpenTelemetry auto-instrumentation by default. For manual spans:

import io.opentelemetry.api.GlobalOpenTelemetry
import io.opentelemetry.api.trace.StatusCode

val tracer = GlobalOpenTelemetry.getTracer("my-app")

fun processPayment(paymentId: String) {
    val span = tracer.spanBuilder("processPayment")
        .setAttribute("payment.id", paymentId)
        .startSpan()

    try {
        // Business logic
        val payment = repository.findPayment(paymentId)
        span.setAttribute("payment.amount", payment.amount)

        processPaymentInternal(payment)
        span.setStatus(StatusCode.OK)
    } catch (e: Exception) {
        span.setStatus(StatusCode.ERROR, "Payment processing failed")
        span.recordException(e)
        throw e
    } finally {
        span.end()
    }
}
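
A span created this way is not automatically the parent of downstream auto-instrumented calls. A minimal sketch of making it current for the duration of the work, assuming the standard opentelemetry-api Scope handling (withSpan is a hypothetical helper, not part of the skill):

import io.opentelemetry.api.trace.Span

fun <T> withSpan(span: Span, block: () -> T): T {
    // makeCurrent() returns a Scope (AutoCloseable); while it is open,
    // auto-instrumented clients (HTTP, JDBC, Kafka) pick this span up as parent.
    return span.makeCurrent().use {
        block()
    }
}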

Structured Logging

import java.time.LocalDateTime
import mu.KotlinLogging
import net.logstash.logback.argument.StructuredArguments.kv

private val logger = KotlinLogging.logger {}

fun processOrder(orderId: String) {
    logger.info(
        "Processing order",
        kv("order_id", orderId),
        kv("timestamp", LocalDateTime.now())
    )

    try {
        orderService.process(orderId)

        logger.info(
            "Order processed successfully",
            kv("order_id", orderId)
        )
    } catch (e: Exception) {
        logger.error(
            "Order processing failed",
            kv("order_id", orderId),
            kv("error", e.message),
            e
        )
        throw e
    }
}
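
The kv(...) arguments only become queryable JSON fields when a Logstash encoder is configured; a minimal logback.xml sketch, assuming the net.logstash.logback:logstash-logback-encoder dependency is on the classpath:

<configuration>
  <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
    <!-- Serializes each log event, including kv fields, as one JSON object per line -->
    <encoder class="net.logstash.logback.encoder.LogstashEncoder"/>
  </appender>
  <root level="INFO">
    <appender-ref ref="STDOUT"/>
  </root>
</configuration>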

Nais Manifest

apiVersion: nais.io/v1alpha1
kind: Application
metadata:
  name: my-app
  namespace: myteam
  labels:
    team: myteam
spec:
  image: ghcr.io/navikt/my-app:latest
  port: 8080

  # Health checks
  liveness:
    path: /isalive
    initialDelay: 10
    timeout: 1
    periodSeconds: 10
    failureThreshold: 3

  readiness:
    path: /isready
    initialDelay: 10
    timeout: 1
    periodSeconds: 10
    failureThreshold: 3

  # Prometheus scraping
  prometheus:
    enabled: true
    path: /metrics

  # OpenTelemetry auto-instrumentation
  observability:
    autoInstrumentation:
      enabled: true
      runtime: java  # Instruments Ktor, JDBC, Kafka automatically
    logging:
      destinations:
        - id: loki       # Automatic Loki shipping
        - id: team-logs  # Optional: private team logs

  # Resources (for metrics alerting)
  resources:
    limits:
      memory: 512Mi
    requests:
      cpu: 50m
      memory: 256Mi

Alert Configuration

Create .nais/alert.yml:

apiVersion: nais.io/v1
kind: Alert
metadata:
  name: my-app-alerts
  namespace: myteam
  labels:
    team: myteam
spec:
  receivers:
    slack:
      channel: "#team-alerts"
      prependText: "@here "
  alerts:
    - alert: HighErrorRate
      expr: |
        (sum(rate(http_requests_total{app="my-app",status=~"5.."}[5m]))
        / sum(rate(http_requests_total{app="my-app"}[5m]))) > 0.05
      for: 5m
      description: "Error rate is {{ $value | humanizePercentage }}"
      action: "Check logs in Grafana Loki"
      documentation: https://teamdocs/runbooks/high-error-rate
      sla: "Respond within 15 minutes"
      severity: critical

    - alert: HighResponseTime
      expr: |
        histogram_quantile(0.95,
          rate(http_request_duration_seconds_bucket{app="my-app"}[5m])
        ) > 1
      for: 10m
      description: "95th percentile response time is {{ $value }}s"
      action: "Check Tempo traces for slow requests"
      severity: warning

    - alert: PodCrashLooping
      expr: |
        rate(kube_pod_container_status_restarts_total{
          pod=~"my-app-.*"
        }[15m]) > 0
      for: 5m
      description: "Pod {{ $labels.pod }} is crash looping"
      action: "Check logs: kubectl logs {{ $labels.pod }}"
      severity: critical

    - alert: HighMemoryUsage
      expr: |
        (container_memory_working_set_bytes{app="my-app"}
        / container_spec_memory_limit_bytes{app="my-app"}) > 0.9
      for: 10m
      description: "Memory usage is {{ $value | humanizePercentage }}"
      action: "Check for memory leaks, increase limits if needed"
      severity: warning

Complete Example

import com.zaxxer.hikari.HikariDataSource
import io.ktor.server.application.*
import io.ktor.server.engine.*
import io.ktor.server.netty.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import io.micrometer.core.instrument.Timer
import io.micrometer.prometheus.PrometheusMeterRegistry
import io.opentelemetry.api.GlobalOpenTelemetry
import io.opentelemetry.api.trace.StatusCode
import mu.KotlinLogging
import net.logstash.logback.argument.StructuredArguments.kv

private val logger = KotlinLogging.logger {}

fun main() {
    val env = Environment.from(System.getenv())
    val dataSource = createDataSource(env.databaseUrl)
    val kafkaProducer = createKafkaProducer(env) // app-specific helper, assumed to mirror createDataSource

    // Run database migrations
    runMigrations(dataSource)

    // Setup metrics
    val meterRegistry = setupMetrics()

    embeddedServer(Netty, port = 8080) {
        configureHealthEndpoints(dataSource, kafkaProducer)
        configureMetrics(meterRegistry)
        configureRouting(dataSource, meterRegistry)
    }.start(wait = true)
}

fun Application.configureRouting(
    dataSource: HikariDataSource,
    meterRegistry: PrometheusMeterRegistry
) {
    val tracer = GlobalOpenTelemetry.getTracer("my-app")

    routing {
        get("/api/users") {
            val requestTimer = Timer.start(meterRegistry)
            val requestCounter = meterRegistry.counter(
                "http_requests_total",
                "method", "GET",
                "endpoint", "/api/users"
            )

            val span = tracer.spanBuilder("getUsersRequest")
                .setAttribute("http.method", "GET")
                .setAttribute("http.route", "/api/users")
                .startSpan()

            try {
                val users = userRepository.findAll()
                span.setAttribute("user.count", users.size.toLong())
                span.setStatus(StatusCode.OK)

                requestCounter.increment()
                requestTimer.stop(meterRegistry.timer(
                    "http_request_duration_seconds",
                    "method", "GET",
                    "endpoint", "/api/users",
                    "status", "200"
                ))

                call.respond(users)
            } catch (e: Exception) {
                span.setStatus(StatusCode.ERROR, "Failed to get users")
                span.recordException(e)

                meterRegistry.counter(
                    "http_requests_total",
                    "method", "GET",
                    "endpoint", "/api/users",
                    "status", "500"
                ).increment()

                logger.error(
                    "Failed to get users",
                    kv("trace_id", span.spanContext.traceId),
                    kv("span_id", span.spanContext.spanId),
                    e
                )

                throw e
            } finally {
                span.end()
            }
        }
    }
}

Grafana Dashboard Example

Create a dashboard in Grafana with these panels:

Panel 1: Request Rate

sum(rate(http_requests_total{app="my-app"}[5m])) by (endpoint)

Panel 2: Error Rate

sum(rate(http_requests_total{app="my-app",status=~"5.."}[5m])) / sum(rate(http_requests_total{app="my-app"}[5m])) * 100

Panel 3: Response Time (p50, p95, p99)

histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{app="my-app"}[5m]))
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{app="my-app"}[5m]))
histogram_quantile(0.99, rate(http_request_duration_seconds_bucket{app="my-app"}[5m]))

Panel 4: Memory Usage

container_memory_working_set_bytes{app="my-app"} / container_spec_memory_limit_bytes{app="my-app"} * 100

Panel 5: Database Connections

hikaricp_connections_active{app="my-app"}
hikaricp_connections_max{app="my-app"}

Panel 6: Kafka Consumer Lag

kafka_consumer_lag{app="my-app"}

Loki Query Examples

View logs in Grafana Loki Explorer:

All logs from your app

{app="my-app", namespace="myteam"}

Only errors

{app="my-app"} |= "ERROR"

JSON logs with specific field

{app="my-app"} | json | event_type="payment_processed"

Logs correlated with trace

{app="my-app"} | json | trace_id="abc123def456"

Count errors per minute

sum(rate({app="my-app"} |= "ERROR" [1m])) by (pod)

Tempo Trace Search

View traces in Grafana Tempo:

  • Open Grafana → Explore

  • Select Tempo data source

  • Query by:

      • Service name: my-app

      • Operation: getUsersRequest

      • Duration: > 1s

      • Status: error

Or link from logs by clicking trace_id in Loki.
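
The same filters can be expressed as a single TraceQL query in Tempo's query box (a sketch; assumes TraceQL is enabled on the Tempo data source):

{ resource.service.name = "my-app" && name = "getUsersRequest" && duration > 1s && status = error }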

Monitoring Checklist

  • /isalive endpoint implemented

  • /isready endpoint with dependency checks (database, Kafka)

  • /metrics endpoint exposing Prometheus metrics

  • Health checks configured in Nais manifest

  • Business metrics instrumented (counters, timers, gauges; a gauge sketch follows this list)

  • Structured logging with correlation IDs (trace_id, span_id)

  • OpenTelemetry auto-instrumentation enabled in Nais manifest

  • Alert rules created in .nais/alert.yml

  • Slack channel configured for alerts

  • Grafana dashboard created

  • No sensitive data in logs or metrics (verify in Grafana)

  • High-cardinality labels avoided (no user_ids, transaction_ids)
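
Counters and timers are shown in the sections above, but gauges are not. A minimal gauge sketch, assuming the shared meterRegistry from the metrics setup (the work_queue_depth metric and the queue itself are illustrative):

import io.micrometer.core.instrument.Gauge
import java.util.concurrent.atomic.AtomicInteger

// Micrometer polls the supplier on every scrape, so the gauge reads live
// state the application keeps up to date (here, a hypothetical queue depth).
val queueDepth = AtomicInteger(0)

fun registerQueueDepthGauge() {
    Gauge.builder("work_queue_depth") { queueDepth.get() }
        .description("Items currently waiting in the work queue")
        .register(meterRegistry)
}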

Production Patterns from navikt

Based on 177+ navikt repositories with observability configured:

JVM Metrics Binders (navikt/ao-oppfolgingskontor)

import io.micrometer.core.instrument.binder.jvm.*
import io.micrometer.core.instrument.binder.system.*

install(MicrometerMetrics) {
    registry = meterRegistry
    meterBinders = listOf(
        JvmMemoryMetrics(),  // Heap, non-heap, buffer pool metrics
        JvmGcMetrics(),      // GC pause time, count
        ProcessorMetrics(),  // CPU usage
        UptimeMetrics()      // Application uptime
    )
}

Common Counter Patterns

// From dp-rapportering: Track business events
val eventsProcessed = Counter.builder("events_processed_total")
    .description("Total events processed")
    .tag("event_type", "rapportering_innsendt")
    .tag("status", "ok")
    .register(meterRegistry)

// From dp-rapportering: Track API errors
val apiErrors = Counter.builder("api_errors_total")
    .description("Total API errors")
    .tag("endpoint", "/api/rapporteringsperioder")
    .tag("error_type", "validation_error")
    .register(meterRegistry)

Timer Patterns

import java.util.concurrent.TimeUnit.MILLISECONDS
import kotlin.time.measureTimedValue

// From dp-rapportering: Measure HTTP call duration
suspend fun <T> timedAction(navn: String, block: suspend () -> T): T {
    val (result, duration) = measureTimedValue { block() }
    Timer.builder("http_timer")
        .tag("navn", navn)
        .description("HTTP call duration")
        .register(meterRegistry)
        .record(duration.inWholeMilliseconds, MILLISECONDS)
    return result
}

DORA Metrics Examples

Track DORA metrics for your team:

// Deployment frequency
val deployments = Counter.builder("deployments_total")
    .description("Total deployments")
    .tag("team", "myteam")
    .tag("environment", "production")
    .register(meterRegistry)

// Lead time for changes (commit to deploy)
val leadTime = Timer.builder("deployment_lead_time_seconds")
    .description("Time from commit to deployment")
    .tag("team", "myteam")
    .register(meterRegistry)

// Change failure rate
val failedDeployments = Counter.builder("deployments_failed_total")
    .description("Total failed deployments")
    .tag("team", "myteam")
    .register(meterRegistry)

// Time to restore service
val incidentResolutionTime = Timer.builder("incident_resolution_duration_seconds")
    .description("Time to resolve incidents")
    .tag("team", "myteam")
    .tag("severity", "critical")
    .register(meterRegistry)
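
A sketch of how these meters might be driven, assuming a deploy-webhook handler (onDeploymentFinished and the commitTime parameter are illustrative, not part of the skill):

import java.time.Duration
import java.time.Instant

fun onDeploymentFinished(commitTime: Instant, success: Boolean) {
    if (success) {
        deployments.increment()
        // Lead time: commit timestamp to the moment the release went live
        leadTime.record(Duration.between(commitTime, Instant.now()))
    } else {
        failedDeployments.increment()
    }
}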

Alert on DORA metrics:

- alert: LowDeploymentFrequency
  expr: |
    sum(increase(deployments_total{team="myteam",environment="production"}[7d])) < 5
  description: "Only {{ $value }} deployments in last 7 days (target: >1/day)"
  severity: info

- alert: HighChangeFailureRate
  expr: |
    sum(rate(deployments_failed_total{team="myteam"}[7d]))
    / sum(rate(deployments_total{team="myteam"}[7d])) > 0.15
  description: "Change failure rate is {{ $value | humanizePercentage }} (target: <15%)"
  severity: warning

See https://dora.dev for benchmarks and best practices.

Related Skills

Related by shared tags or category signals:

  • aksel-spacing

  • kotlin-app-config

  • flyway-migration