# Observability Setup Skill
This skill provides patterns for setting up observability in Nais applications.
## Required Health Endpoints
```kotlin
import com.zaxxer.hikari.HikariDataSource
import io.ktor.http.*
import io.ktor.server.application.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import org.apache.kafka.clients.producer.KafkaProducer

fun Application.configureHealthEndpoints(
    dataSource: HikariDataSource,
    kafkaProducer: KafkaProducer<String, String>
) {
    routing {
        get("/isalive") {
            call.respondText("Alive", ContentType.Text.Plain)
        }
        get("/isready") {
            val databaseHealthy = checkDatabase(dataSource)
            val kafkaHealthy = checkKafka(kafkaProducer)
            if (databaseHealthy && kafkaHealthy) {
                call.respondText("Ready", ContentType.Text.Plain)
            } else {
                call.respondText(
                    "Not ready",
                    ContentType.Text.Plain,
                    HttpStatusCode.ServiceUnavailable
                )
            }
        }
    }
}

fun checkDatabase(dataSource: HikariDataSource): Boolean = try {
    // isValid(1) pings the database with a 1-second timeout
    dataSource.connection.use { it.isValid(1) }
} catch (e: Exception) {
    false
}

fun checkKafka(producer: KafkaProducer<String, String>): Boolean = try {
    // Fetching partition metadata verifies broker connectivity
    producer.partitionsFor("health-check-topic").isNotEmpty()
} catch (e: Exception) {
    false
}
```
## Prometheus Metrics Setup
```kotlin
import io.ktor.http.*
import io.ktor.server.application.*
import io.ktor.server.metrics.micrometer.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import io.micrometer.core.instrument.Clock
import io.micrometer.core.instrument.binder.jvm.*
import io.micrometer.core.instrument.binder.system.*
import io.micrometer.prometheus.PrometheusConfig
import io.micrometer.prometheus.PrometheusMeterRegistry
import io.prometheus.client.CollectorRegistry

val meterRegistry = PrometheusMeterRegistry(
    PrometheusConfig.DEFAULT,
    CollectorRegistry.defaultRegistry,
    Clock.SYSTEM
)

fun Application.configureMetrics() {
    install(MicrometerMetrics) {
        registry = meterRegistry
        // Production pattern from navikt/ao-oppfolgingskontor
        meterBinders = listOf(
            JvmMemoryMetrics(),   // Heap, non-heap memory
            JvmGcMetrics(),       // Garbage collection
            ProcessorMetrics(),   // CPU usage
            UptimeMetrics()       // Application uptime
        )
    }
    routing {
        get("/metrics") {
            call.respondText(
                meterRegistry.scrape(),
                // Prometheus text exposition format
                ContentType.parse("text/plain; version=0.0.4")
            )
        }
    }
}
```
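The dashboard and alert queries later in this skill filter on an `app` label. On Nais that label is normally attached at scrape time; if you run the app elsewhere, or want a `team` label on every series, Micrometer's common tags can add them in-process. A minimal sketch, reusing the `meterRegistry` above (the `"my-app"`/`"myteam"` values are assumptions, not platform defaults):

```kotlin
// Hedged sketch: attach common tags once at startup so every meter carries them.
fun configureCommonTags(registry: PrometheusMeterRegistry) {
    registry.config().commonTags("app", "my-app", "team", "myteam")
}
```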
## Business Metrics
```kotlin
import io.micrometer.core.instrument.Counter
import io.micrometer.core.instrument.Timer
import io.micrometer.prometheus.PrometheusMeterRegistry

class UserService(
    private val repository: UserRepository,
    meterRegistry: PrometheusMeterRegistry
) {
    private val userCreatedCounter = Counter.builder("users_created_total")
        .description("Total users created")
        .register(meterRegistry)

    private val userCreationTimer = Timer.builder("user_creation_duration_seconds")
        .description("User creation duration")
        .register(meterRegistry)

    fun createUser(user: User) {
        // record {} times the block and adds the duration to the timer
        userCreationTimer.record {
            repository.save(user)
        }
        userCreatedCounter.increment()
    }
}
```
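Counters and timers cover events and durations; for values that can go up and down (queue depth, active sessions) a gauge fits better. A minimal sketch, where the `ConcurrentLinkedQueue`-backed field is a hypothetical stand-in for whatever you want to observe:

```kotlin
import io.micrometer.core.instrument.Gauge
import io.micrometer.prometheus.PrometheusMeterRegistry
import java.util.concurrent.ConcurrentLinkedQueue

class WorkQueueMetrics(meterRegistry: PrometheusMeterRegistry) {
    // Hypothetical queue; Micrometer holds only a weak reference to the
    // gauged object, so keep a strong reference like this field.
    private val queue = ConcurrentLinkedQueue<String>()

    init {
        // A gauge is sampled on every scrape rather than incremented by you
        Gauge.builder("work_queue_size", queue) { it.size.toDouble() }
            .description("Items currently waiting in the work queue")
            .register(meterRegistry)
    }
}
```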
## OpenTelemetry Tracing
Nais supports OpenTelemetry auto-instrumentation (enabled in the manifest below). For manual spans:
```kotlin
import io.opentelemetry.api.GlobalOpenTelemetry
import io.opentelemetry.api.trace.StatusCode

val tracer = GlobalOpenTelemetry.getTracer("my-app")

fun processPayment(paymentId: String) {
    val span = tracer.spanBuilder("processPayment")
        .setAttribute("payment.id", paymentId)
        .startSpan()
    try {
        // Business logic (repository and processPaymentInternal are your own code)
        val payment = repository.findPayment(paymentId)
        span.setAttribute("payment.amount", payment.amount)
        processPaymentInternal(payment)
        span.setStatus(StatusCode.OK)
    } catch (e: Exception) {
        span.setStatus(StatusCode.ERROR, "Payment processing failed")
        span.recordException(e)
        throw e
    } finally {
        span.end()
    }
}
```
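One caveat worth knowing: `startSpan()` alone does not make the span current, so spans started deeper in the call stack will not nest under it. A hedged sketch of adding a scope, reusing the `tracer` and helper names from the snippet above:

```kotlin
fun processPaymentScoped(paymentId: String) {
    val span = tracer.spanBuilder("processPayment").startSpan()
    // makeCurrent() installs the span in the current context; any span
    // started inside this block becomes its child.
    val scope = span.makeCurrent()
    try {
        processPaymentInternal(repository.findPayment(paymentId))
    } finally {
        scope.close()
        span.end()
    }
}
```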
## Structured Logging
```kotlin
import java.time.LocalDateTime
import mu.KotlinLogging
import net.logstash.logback.argument.StructuredArguments.kv

private val logger = KotlinLogging.logger {}

fun processOrder(orderId: String) {
    logger.info(
        "Processing order",
        kv("order_id", orderId),
        kv("timestamp", LocalDateTime.now())
    )
    try {
        orderService.process(orderId)
        logger.info(
            "Order processed successfully",
            kv("order_id", orderId)
        )
    } catch (e: Exception) {
        // SLF4J treats a trailing Throwable argument as the exception to log
        logger.error(
            "Order processing failed",
            kv("order_id", orderId),
            kv("error", e.message),
            e
        )
        throw e
    }
}
```
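When several log lines share the same fields, repeating `kv(...)` gets noisy. kotlin-logging's `withLoggingContext` puts key/value pairs into SLF4J's MDC for the duration of a block, so the logstash encoder emits them on every line. A sketch reusing `processOrder`'s names (note that MDC does not follow coroutine dispatch unless you add `MDCContext` from kotlinx-coroutines-slf4j):

```kotlin
import mu.withLoggingContext

fun processOrderWithContext(orderId: String) {
    withLoggingContext("order_id" to orderId) {
        logger.info { "Processing order" }  // order_id included via MDC
        orderService.process(orderId)
        logger.info { "Order processed successfully" }
    }
}
```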
## Nais Manifest
```yaml
apiVersion: nais.io/v1alpha1
kind: Application
metadata:
  name: my-app
  namespace: myteam
  labels:
    team: myteam
spec:
  image: ghcr.io/navikt/my-app:latest
  port: 8080

  # Health checks
  liveness:
    path: /isalive
    initialDelay: 10
    timeout: 1
    periodSeconds: 10
    failureThreshold: 3
  readiness:
    path: /isready
    initialDelay: 10
    timeout: 1
    periodSeconds: 10
    failureThreshold: 3

  # Prometheus scraping
  prometheus:
    enabled: true
    path: /metrics

  # OpenTelemetry auto-instrumentation and log shipping
  observability:
    autoInstrumentation:
      enabled: true
      runtime: java  # Instruments Ktor, JDBC, Kafka automatically
    logging:
      destinations:
        - id: loki       # Automatic Loki shipping
        - id: team-logs  # Optional: private team logs

  # Resources (used by the memory alerts below)
  resources:
    limits:
      memory: 512Mi
    requests:
      cpu: 50m
      memory: 256Mi
```
## Alert Configuration

Create `.nais/alert.yml`:
```yaml
apiVersion: nais.io/v1
kind: Alert
metadata:
  name: my-app-alerts
  namespace: myteam
  labels:
    team: myteam
spec:
  receivers:
    slack:
      channel: "#team-alerts"
      prependText: "@here "
  alerts:
    - alert: HighErrorRate
      expr: |
        (sum(rate(http_requests_total{app="my-app",status=~"5.."}[5m]))
         / sum(rate(http_requests_total{app="my-app"}[5m]))) > 0.05
      for: 5m
      description: "Error rate is {{ $value | humanizePercentage }}"
      action: "Check logs in Grafana Loki"
      documentation: https://teamdocs/runbooks/high-error-rate
      sla: "Respond within 15 minutes"
      severity: critical

    - alert: HighResponseTime
      expr: |
        histogram_quantile(0.95,
          rate(http_request_duration_seconds_bucket{app="my-app"}[5m])
        ) > 1
      for: 10m
      description: "95th percentile response time is {{ $value }}s"
      action: "Check Tempo traces for slow requests"
      severity: warning

    - alert: PodCrashLooping
      expr: |
        rate(kube_pod_container_status_restarts_total{
          pod=~"my-app-.*"
        }[15m]) > 0
      for: 5m
      description: "Pod {{ $labels.pod }} is crash looping"
      action: "Check logs: kubectl logs {{ $labels.pod }}"
      severity: critical

    - alert: HighMemoryUsage
      expr: |
        (container_memory_working_set_bytes{app="my-app"}
         / container_spec_memory_limit_bytes{app="my-app"}) > 0.9
      for: 10m
      description: "Memory usage is {{ $value | humanizePercentage }}"
      action: "Check for memory leaks, increase limits if needed"
      severity: warning
```
## Complete Example
```kotlin
import com.zaxxer.hikari.HikariDataSource
import io.ktor.server.application.*
import io.ktor.server.engine.*
import io.ktor.server.netty.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import io.micrometer.core.instrument.Timer
import io.micrometer.prometheus.PrometheusMeterRegistry
import io.opentelemetry.api.GlobalOpenTelemetry
import io.opentelemetry.api.trace.StatusCode
import mu.KotlinLogging
import net.logstash.logback.argument.StructuredArguments.kv

private val logger = KotlinLogging.logger {}

fun main() {
    // Environment, createDataSource, createKafkaProducer, runMigrations,
    // setupMetrics and userRepository are the application's own code
    val env = Environment.from(System.getenv())
    val dataSource = createDataSource(env.databaseUrl)
    val kafkaProducer = createKafkaProducer(env)

    // Run database migrations
    runMigrations(dataSource)

    // Setup metrics
    val meterRegistry = setupMetrics()

    embeddedServer(Netty, port = 8080) {
        configureHealthEndpoints(dataSource, kafkaProducer)
        configureMetrics(meterRegistry)
        configureRouting(dataSource, meterRegistry)
    }.start(wait = true)
}

fun Application.configureRouting(
    dataSource: HikariDataSource,
    meterRegistry: PrometheusMeterRegistry
) {
    val tracer = GlobalOpenTelemetry.getTracer("my-app")

    routing {
        get("/api/users") {
            // Timer.start() captures a sample; stop() records it on a timer
            val requestTimer = Timer.start(meterRegistry)
            val span = tracer.spanBuilder("getUsersRequest")
                .setAttribute("http.method", "GET")
                .setAttribute("http.route", "/api/users")
                .startSpan()
            try {
                val users = userRepository.findAll()
                span.setAttribute("user.count", users.size.toLong())
                span.setStatus(StatusCode.OK)
                // Keep tag keys identical across outcomes: Micrometer rejects
                // the same meter name registered with differing tag keys
                meterRegistry.counter(
                    "http_requests_total",
                    "method", "GET",
                    "endpoint", "/api/users",
                    "status", "200"
                ).increment()
                requestTimer.stop(meterRegistry.timer(
                    "http_request_duration_seconds",
                    "method", "GET",
                    "endpoint", "/api/users",
                    "status", "200"
                ))
                call.respond(users)
            } catch (e: Exception) {
                span.setStatus(StatusCode.ERROR, "Failed to get users")
                span.recordException(e)
                meterRegistry.counter(
                    "http_requests_total",
                    "method", "GET",
                    "endpoint", "/api/users",
                    "status", "500"
                ).increment()
                logger.error(
                    "Failed to get users",
                    kv("trace_id", span.spanContext.traceId),
                    kv("span_id", span.spanContext.spanId),
                    e
                )
                throw e
            } finally {
                span.end()
            }
        }
    }
}
```
## Grafana Dashboard Example
Create a dashboard in Grafana with these panels:
**Panel 1: Request Rate**

```promql
sum(rate(http_requests_total{app="my-app"}[5m])) by (endpoint)
```

**Panel 2: Error Rate**

```promql
sum(rate(http_requests_total{app="my-app",status=~"5.."}[5m]))
  / sum(rate(http_requests_total{app="my-app"}[5m])) * 100
```

**Panel 3: Response Time (p50, p95, p99)**

```promql
histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{app="my-app"}[5m]))
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{app="my-app"}[5m]))
histogram_quantile(0.99, rate(http_request_duration_seconds_bucket{app="my-app"}[5m]))
```

**Panel 4: Memory Usage**

```promql
container_memory_working_set_bytes{app="my-app"}
  / container_spec_memory_limit_bytes{app="my-app"} * 100
```

**Panel 5: Database Connections**

```promql
hikaricp_connections_active{app="my-app"}
hikaricp_connections_max{app="my-app"}
```

**Panel 6: Kafka Consumer Lag**

```promql
kafka_consumer_lag{app="my-app"}
```
## Loki Query Examples
View logs in Grafana Loki Explorer:
**All logs from your app**

```logql
{app="my-app", namespace="myteam"}
```

**Only errors**

```logql
{app="my-app"} |= "ERROR"
```

**JSON logs with a specific field**

```logql
{app="my-app"} | json | event_type="payment_processed"
```

**Logs correlated with a trace**

```logql
{app="my-app"} | json | trace_id="abc123def456"
```

**Count errors per minute**

```logql
sum(rate({app="my-app"} |= "ERROR" [1m])) by (pod)
```
## Tempo Trace Search
View traces in Grafana Tempo:
- Open Grafana → Explore
- Select the Tempo data source
- Query by:
  - Service name: `my-app`
  - Operation: `getUsersRequest`
  - Duration: `> 1s`
  - Status: `error`
Or link from logs by clicking trace_id in Loki.
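That link only works when log lines actually carry a `trace_id` field. With auto-instrumentation the Java agent typically injects trace IDs into the MDC for you; a manual fallback sketch, assuming the OpenTelemetry API is on the classpath:

```kotlin
import io.opentelemetry.api.trace.Span
import org.slf4j.MDC

// Sketch: copy the current span's IDs into the MDC so the JSON log encoder
// emits trace_id/span_id fields that Grafana can turn into Tempo links.
fun <T> withTraceIds(block: () -> T): T {
    val ctx = Span.current().spanContext
    if (!ctx.isValid) return block()
    MDC.put("trace_id", ctx.traceId)
    MDC.put("span_id", ctx.spanId)
    return try {
        block()
    } finally {
        MDC.remove("trace_id")
        MDC.remove("span_id")
    }
}
```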
## Monitoring Checklist
- `/isalive` endpoint implemented
- `/isready` endpoint with dependency checks (database, Kafka)
- `/metrics` endpoint exposing Prometheus metrics
- Health checks configured in Nais manifest
- Business metrics instrumented (counters, timers, gauges)
- Structured logging with correlation IDs (trace_id, span_id)
- OpenTelemetry auto-instrumentation enabled in Nais manifest
- Alert rules created in `.nais/alert.yml`
- Slack channel configured for alerts
- Grafana dashboard created
- No sensitive data in logs or metrics (verify in Grafana)
- High-cardinality labels avoided (no user_ids, transaction_ids); see the sketch after this list
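The cardinality rule in the last item deserves a concrete illustration: every distinct label value creates a separate time series in Prometheus, so unbounded values like user IDs multiply series without limit. A hedged sketch, where `recordLogin` is a hypothetical example:

```kotlin
import io.micrometer.prometheus.PrometheusMeterRegistry

fun recordLogin(meterRegistry: PrometheusMeterRegistry, userId: String) {
    // Bad (shown only as an anti-pattern): user_id is unbounded, so this
    // would create one time series per user.
    // meterRegistry.counter("logins_total", "user_id", userId).increment()

    // Good: label values come from a small fixed set.
    meterRegistry.counter("logins_total", "method", "oidc").increment()
}
```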
## Production Patterns from navikt

Based on 177+ navikt repositories with observability set up:

### JVM Metrics Binders (navikt/ao-oppfolgingskontor)
```kotlin
import io.micrometer.core.instrument.binder.jvm.*
import io.micrometer.core.instrument.binder.system.*

install(MicrometerMetrics) {
    registry = meterRegistry
    meterBinders = listOf(
        JvmMemoryMetrics(),   // Heap, non-heap, buffer pool metrics
        JvmGcMetrics(),       // GC pause time, count
        ProcessorMetrics(),   // CPU usage
        UptimeMetrics()       // Application uptime
    )
}
```
### Common Counter Patterns
```kotlin
// From dp-rapportering: track business events
val eventsProcessed = Counter.builder("events_processed_total")
    .description("Total events processed")
    .tag("event_type", "rapportering_innsendt")
    .tag("status", "ok")
    .register(meterRegistry)

// From dp-rapportering: track API errors
val apiErrors = Counter.builder("api_errors_total")
    .description("Total API errors")
    .tag("endpoint", "/api/rapporteringsperioder")
    .tag("error_type", "validation_error")
    .register(meterRegistry)
```
### Timer Patterns
```kotlin
import java.util.concurrent.TimeUnit.MILLISECONDS
import kotlin.time.measureTimedValue

// From dp-rapportering: measure HTTP call duration
suspend fun <T> timedAction(navn: String, block: suspend () -> T): T {
    val (result, duration) = measureTimedValue { block() }
    Timer.builder("http_timer")
        .tag("navn", navn)
        .description("HTTP call duration")
        .register(meterRegistry)
        .record(duration.inWholeMilliseconds, MILLISECONDS)
    return result
}
```
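Hypothetical usage, where `RapporteringClient` and `Periode` stand in for your own client and model; the duration lands in `http_timer{navn="hent-perioder"}`:

```kotlin
// Wrap any suspend call to time it under a named tag
suspend fun hentPerioder(client: RapporteringClient): List<Periode> =
    timedAction("hent-perioder") { client.hentPerioder() }
```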
## DORA Metrics Examples
Track DORA metrics for your team:
```kotlin
// Deployment frequency
val deployments = Counter.builder("deployments_total")
    .description("Total deployments")
    .tag("team", "myteam")
    .tag("environment", "production")
    .register(meterRegistry)

// Lead time for changes (commit to deploy)
val leadTime = Timer.builder("deployment_lead_time_seconds")
    .description("Time from commit to deployment")
    .tag("team", "myteam")
    .register(meterRegistry)

// Change failure rate
val failedDeployments = Counter.builder("deployments_failed_total")
    .description("Total failed deployments")
    .tag("team", "myteam")
    .register(meterRegistry)

// Time to restore service
val incidentResolutionTime = Timer.builder("incident_resolution_duration_seconds")
    .description("Time to resolve incidents")
    .tag("team", "myteam")
    .tag("severity", "critical")
    .register(meterRegistry)
```
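These meters still have to be fed from your deploy and incident tooling. A hedged sketch of one way to record them, assuming CI passes the commit timestamp as an ISO-8601 string:

```kotlin
import java.time.Duration
import java.time.Instant

// Hypothetical hook invoked when a deployment finishes
fun onDeploymentFinished(commitTimestamp: String, succeeded: Boolean) {
    deployments.increment()
    if (!succeeded) failedDeployments.increment()
    // Lead time: commit -> deployed
    leadTime.record(Duration.between(Instant.parse(commitTimestamp), Instant.now()))
}
```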
Alert on DORA metrics:
```yaml
- alert: LowDeploymentFrequency
  expr: |
    sum(increase(deployments_total{team="myteam",environment="production"}[7d])) < 5
  description: "Only {{ $value }} deployments in last 7 days (target: >1/day)"
  severity: info

- alert: HighChangeFailureRate
  expr: |
    sum(rate(deployments_failed_total{team="myteam"}[7d]))
      / sum(rate(deployments_total{team="myteam"}[7d])) > 0.15
  description: "Change failure rate is {{ $value | humanizePercentage }} (target: <15%)"
  severity: warning
```
See https://dora.dev for benchmarks and best practices.