openresty配置prometheus监控

# nginx.conf配置
# Worker processes run as user "nginx", group "nginx".
user nginx nginx;

# Worker count and CPU binding are both chosen automatically.
worker_processes auto;
worker_cpu_affinity auto;

# Per-worker limit on open file descriptors.
worker_rlimit_nofile 65535;

pid /run/openresty.pid;

# Main-context error log; only "error" and more severe messages are written.
error_log /usr/local/openresty/nginx/logs/error.log error;

events {
    # Use the epoll connection-processing method.
    use epoll;

    # Maximum simultaneous connections per worker process.
    worker_connections 10240;

    # Workers accept every pending connection at once and do not take
    # turns through the accept mutex.
    accept_mutex off;
    multi_accept on;
}

http {
    # Basic settings
    # Hide the server version and declare UTF-8 as the response charset.
    server_tokens off;
    charset utf-8;

    # MIME type map, with a binary fallback for unknown extensions.
    include /usr/local/openresty/nginx/conf/mime.types;
    default_type application/octet-stream;

    # Log formats
    # Plain-text access format with request_time, server_name and host appended
    # as key=value pairs; variable values are JSON-escaped.
    # NOTE(review): no access_log in this file references the "prometheus"
    # format - confirm whether an included conf.d file uses it.
    log_format prometheus escape=json '$remote_addr - $remote_user [$time_local] '
                            '"$request" $status $body_bytes_sent '
                            '"$http_referer" "$http_user_agent" '
                            'request_time=$request_time '
                            'server_name=$server_name '
                            'host=$host';

    # One JSON object per request. status, request_time and body_bytes_sent are
    # written unquoted (as JSON numbers); every other field is a quoted string.
    log_format main_json escape=json '{'
        '"timestamp":"$time_iso8601",'
        '"remote_addr":"$remote_addr",'
        '"request_method":"$request_method",'
        '"request_uri":"$request_uri",'
        '"status":$status,'
        '"request_time":$request_time,'
        '"body_bytes_sent":$body_bytes_sent,'
        '"server_name":"$server_name",'
        '"host":"$host"'
    '}';

    # Access log in main_json format, written through a 32k buffer that is
    # flushed at least every 5 seconds.
    access_log /usr/local/openresty/nginx/logs/access.log main_json buffer=32k flush=5s;
    # http-level error log at "warn"; same file as the main-context error_log.
    error_log /usr/local/openresty/nginx/logs/error.log warn;

    # Performance tuning
    # Kernel-level file transmission and TCP packet behaviour.
    sendfile on;
    sendfile_max_chunk 512k;
    tcp_nodelay on;
    tcp_nopush on;

    # Connection lifetime and client timeouts.
    keepalive_requests 1000;
    keepalive_timeout 30s;
    client_body_timeout 30s;
    client_header_timeout 30s;
    send_timeout 30s;
    reset_timedout_connection on;

    # Request size limit and request buffers.
    client_max_body_size 100m;
    client_body_buffer_size 128k;
    client_header_buffer_size 4k;
    large_client_header_buffers 4 16k;

    # Hash table sizing for server names and MIME types.
    server_names_hash_max_size 512;
    server_names_hash_bucket_size 128;
    types_hash_max_size 2048;
    types_hash_bucket_size 128;

    # Gzip compression
    gzip on;
    gzip_comp_level 5;
    # Responses shorter than 1024 bytes are sent uncompressed.
    gzip_min_length 1024;
    gzip_vary on;
    # MIME types compressed in addition to text/html.
    gzip_types
        text/plain
        text/css
        text/xml
        application/json
        application/javascript
        application/xml;

    # ========== Prometheus Lua monitoring ==========
    # Lua module search path; the trailing ";;" stands for the default path.
    lua_package_path "/usr/local/openresty/site/lualib/?.lua;/usr/local/openresty/lualib/?.lua;;";

    # Shared-memory dictionaries. "prometheus_metrics" is the dictionary name
    # passed to prometheus.init in init_worker_by_lua_block.
    lua_shared_dict prometheus_metrics 10M;
    # NOTE(review): "rate_limit" is not referenced in this file - presumably
    # used by a config pulled in from conf.d; confirm.
    lua_shared_dict rate_limit 10M;

    init_by_lua_block {
        -- Load the nginx-lua-prometheus module and publish it as a global so
        -- that init_worker_by_lua_block can read it from _G.prometheus_lib.
        local prometheus_lib = require("prometheus")
        _G.prometheus_lib = prometheus_lib

        ngx.log(ngx.NOTICE, "Prometheus library loaded in init_by_lua")
    }

    init_worker_by_lua_block {
        -- Per-worker Prometheus setup: initialise the library against the
        -- "prometheus_metrics" shared dict, register every metric, and publish
        -- the results as _G.prometheus / _G.prometheus_metrics for the
        -- request-phase handlers (/metrics, /health, /debug, log_by_lua).
        --
        -- Metrics are registered right here rather than from a delayed timer,
        -- so the globals are available as soon as the worker starts serving.
        -- Every registration failure is logged instead of being discarded.
        ngx.log(ngx.NOTICE, "Starting init_worker_by_lua")

        local prometheus = _G.prometheus_lib
        if not prometheus then
            ngx.log(ngx.ERR, "Prometheus library not available")
            return
        end

        local ok, prom_instance = pcall(prometheus.init, "prometheus_metrics", {
            error_metric_name = "nginx_metric_errors_total",
            sync_interval = 1
        })
        if not ok or not prom_instance then
            ngx.log(ngx.ERR, "Failed to init prometheus: ", tostring(prom_instance))
            return
        end

        ngx.log(ngx.NOTICE, "Prometheus initialized successfully")

        -- One entry per metric:
        --   key     field name in _G.prometheus_metrics
        --   kind    registration method on the prometheus instance
        --   buckets histogram bucket boundaries (histograms only)
        local definitions = {
            -- Request count
            {
                key = "requests", kind = "counter",
                name = "nginx_http_requests_total",
                help = "Total number of HTTP requests",
                labels = {"server_name", "port", "status", "method", "scheme"}
            },
            -- Request latency
            {
                key = "request_duration", kind = "histogram",
                name = "nginx_http_request_duration_seconds",
                help = "HTTP request duration in seconds",
                labels = {"server_name", "port", "method"},
                buckets = {0.1, 0.5, 1, 2, 5, 10, 30}
            },
            -- Response size
            {
                key = "response_size", kind = "histogram",
                name = "nginx_http_response_size_bytes",
                help = "HTTP response size in bytes",
                labels = {"server_name", "port"},
                buckets = {100, 1000, 10000, 100000, 1000000, 10000000}
            },
            -- Connection counts (global)
            {
                key = "connections", kind = "gauge",
                name = "nginx_connections",
                help = "Number of HTTP connections",
                labels = {"state"}
            },
            -- SSL request count
            {
                key = "ssl_requests", kind = "counter",
                name = "nginx_http_ssl_requests_total",
                help = "Total number of SSL HTTP requests",
                labels = {"server_name", "port"}
            },
            -- Upstream response time
            {
                key = "upstream_response", kind = "histogram",
                name = "nginx_upstream_response_time_seconds",
                help = "Upstream response time in seconds",
                labels = {"server_name", "port"},
                buckets = {0.1, 0.5, 1, 2, 5, 10, 30}
            },
            -- Request length
            {
                key = "request_length", kind = "histogram",
                name = "nginx_http_request_length_bytes",
                help = "HTTP request length in bytes",
                labels = {"server_name", "port"},
                buckets = {100, 1000, 10000, 100000, 1000000}
            }
        }

        local metrics = {}
        local failures = 0
        for _, def in ipairs(definitions) do
            -- buckets is nil for counters and gauges; the extra argument is ignored.
            local created, metric = pcall(prom_instance[def.kind], prom_instance,
                def.name, def.help, def.labels, def.buckets)
            if created and metric then
                metrics[def.key] = metric
            else
                failures = failures + 1
                ngx.log(ngx.ERR, "Failed to create metric ", def.name, ": ",
                        created and "registration returned nil" or tostring(metric))
            end
        end

        _G.prometheus = prom_instance
        _G.prometheus_metrics = metrics

        if failures == 0 then
            ngx.log(ngx.NOTICE, "All metrics created successfully")
        else
            ngx.log(ngx.ERR, failures, " of ", #definitions,
                    " metrics could not be created")
        end
    }
    # =============================================

    # 监控服务器 - 用于暴露 metrics
    server {
        listen 127.0.0.1:28080;
        server_name localhost;
        access_log off;
        
        location /metrics {
            # Only local scrapers may read the metrics.
            allow 127.0.0.1;
            deny all;

            content_by_lua_block {
                -- Refreshes the connection gauges and writes all collected
                -- metrics to the response. Answers 500 while the worker has
                -- not published _G.prometheus / _G.prometheus_metrics.
                local prometheus = _G.prometheus
                local metrics = _G.prometheus_metrics
                if not prometheus or not metrics then
                    ngx.status = 500
                    ngx.say("# Prometheus not initialized")
                    return
                end

                -- Update the global connection gauges.
                local connections = metrics.connections
                if connections then
                    local reading = tonumber(ngx.var.connections_reading) or 0
                    local writing = tonumber(ngx.var.connections_writing) or 0
                    local waiting = tonumber(ngx.var.connections_waiting) or 0
                    -- nginx counts waiting (idle keep-alive) connections as
                    -- active too, so take its own total; if the variable is
                    -- unavailable, fall back to the sum of the three states.
                    local active = tonumber(ngx.var.connections_active)
                        or (reading + writing + waiting)

                    local samples = {
                        {"reading", reading},
                        {"writing", writing},
                        {"waiting", waiting},
                        {"active", active}
                    }
                    for _, sample in ipairs(samples) do
                        local ok, err = pcall(connections.set, connections,
                                              sample[2], {sample[1]})
                        if not ok then
                            ngx.log(ngx.ERR, "Failed to set nginx_connections state=",
                                    sample[1], ": ", tostring(err))
                        end
                    end
                end

                prometheus:collect()
            }
        }

        location /health {
            default_type application/json;
            content_by_lua_block {
                -- Reports whether this worker has published the Prometheus
                -- instance and the metrics table yet.
                local prometheus_state = "initializing"
                if _G.prometheus then
                    prometheus_state = "enabled"
                end

                local metrics_state = "no"
                if _G.prometheus_metrics then
                    metrics_state = "yes"
                end

                local report = {
                    status = "healthy",
                    timestamp = os.date("%Y-%m-%dT%H:%M:%S%z"),
                    prometheus = prometheus_state,
                    metrics_created = metrics_state
                }

                -- Without cjson, fall back to a hand-built two-field document.
                local has_cjson, cjson = pcall(require, "cjson")
                if not has_cjson then
                    ngx.say('{"status":"'..report.status..'","prometheus":"'..report.prometheus..'"}')
                    return
                end

                ngx.say(cjson.encode(report))
            }
            allow 127.0.0.1;
            deny all;
        }

        location /debug {
            default_type text/plain;
            content_by_lua_block {
                -- Plain-text dump: whether the Prometheus globals exist, the
                -- keys of the metrics table, and nginx's connection counters.
                local instance = _G.prometheus
                local registry = _G.prometheus_metrics

                ngx.say("=== Prometheus Debug Info ===\n")
                ngx.say("prometheus instance: ", instance and "exists" or "nil")
                ngx.say("metrics table: ", registry and "exists" or "nil")

                if registry then
                    ngx.say("\nAvailable metrics:")
                    for metric_key in pairs(registry) do
                        ngx.say("  - ", metric_key)
                    end
                end

                local reading = ngx.var.connections_reading or 0
                local writing = ngx.var.connections_writing or 0
                local waiting = ngx.var.connections_waiting or 0

                ngx.say("\nNginx connections:")
                ngx.say("  reading: ", reading)
                ngx.say("  writing: ", writing)
                ngx.say("  waiting: ", waiting)
            }
            allow 127.0.0.1;
            deny all;
        }
    }

    include /usr/local/openresty/nginx/conf/conf.d/*.conf;
}

# 需要监控server的配置，以代理grafana为例（server 块必须位于 http 上下文中，例如保存到上面 include 的 conf.d/*.conf）
server {
    listen 8443 ssl;
    server_name grafana.local;

    ssl_certificate /usr/local/openresty/nginx/conf/sz.crt;
    ssl_certificate_key /usr/local/openresty/nginx/conf/sz.key;
    ssl_session_timeout 5m;
    ssl_ciphers HIGH:!aNULL:!MD5;
    ssl_prefer_server_ciphers on;
    
    log_by_lua_block {
        -- Records per-request Prometheus samples for this server: request
        -- count, SSL request count, request duration, response size, upstream
        -- response time and request length. Does nothing until the worker has
        -- published _G.prometheus_metrics.
        local metrics = _G.prometheus_metrics
        if not metrics then
            return
        end

        -- Calls metric[action](metric, value, labels) when the metric exists,
        -- logging a failure instead of silently discarding it.
        local function record(metric, action, value, labels)
            if not metric then
                return
            end
            local ok, err = pcall(metric[action], metric, value, labels)
            if not ok then
                ngx.log(ngx.ERR, "Failed to ", action, " prometheus metric: ",
                        tostring(err))
            end
        end

        local server_name = "grafana"
        local port = ngx.var.server_port or "8443"
        local status = ngx.var.status or "0"
        local method = ngx.var.request_method or "UNKNOWN"
        local scheme = ngx.var.scheme or "https"
        local request_time = tonumber(ngx.var.request_time) or 0
        local body_bytes_sent = tonumber(ngx.var.body_bytes_sent) or 0
        local request_length = tonumber(ngx.var.request_length) or 0

        -- $upstream_response_time may hold several values separated by commas
        -- or colons; take the first one. It stays nil when no upstream was
        -- contacted or the value is not numeric (e.g. "-"), so that a real
        -- 0.000 measurement can still be told apart from "no upstream".
        local upstream_response_time
        local upstream_response_str = ngx.var.upstream_response_time
        if upstream_response_str then
            local first = string.match(upstream_response_str, "^%s*(%d+%.?%d*)")
            if first then
                upstream_response_time = tonumber(first)
            end
        end

        -- Request count
        record(metrics.requests, "inc", 1,
               {server_name, port, status, method, scheme})

        -- SSL request count (only for SSL connections; an empty string is
        -- truthy in Lua, so it is excluded explicitly)
        local ssl_protocol = ngx.var.ssl_protocol
        if ssl_protocol and ssl_protocol ~= "" then
            record(metrics.ssl_requests, "inc", 1, {server_name, port})
        end

        -- Request latency
        record(metrics.request_duration, "observe", request_time,
               {server_name, port, method})

        -- Response size
        record(metrics.response_size, "observe", body_bytes_sent,
               {server_name, port})

        -- Upstream response time (only when an upstream actually answered)
        if upstream_response_time then
            record(metrics.upstream_response, "observe", upstream_response_time,
                   {server_name, port})
        end

        -- Request length
        record(metrics.request_length, "observe", request_length,
               {server_name, port})
    }
    
    location / {
        # Everything not matched by a more specific location goes to the
        # backend on local port 3000.
        proxy_pass http://127.0.0.1:3000/;

        # Upstream timeouts
        proxy_read_timeout 60s;
        proxy_send_timeout 60s;
        proxy_connect_timeout 60s;

        # Forward the original host, client address and scheme to the upstream.
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Real-IP $remote_addr;
    }
    
    location /api/live/ws {
        # WebSocket endpoint. proxy_pass carries no URI part, so the request
        # URI reaches the backend unchanged; the earlier "/api/live/ws/" target
        # rewrote "/api/live/ws" to a path with an extra trailing slash.
        proxy_pass http://127.0.0.1:3000;

        # WebSocket upgrade requires HTTP/1.1 and the Upgrade/Connection headers.
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "Upgrade";

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Connection establishment cannot usually take longer than 75 seconds,
        # so a multi-day connect timeout has no effect; 60s matches "location /".
        proxy_connect_timeout 60s;
        # Long-lived WebSocket sessions: tolerate up to 7 days between
        # successive reads/writes before the connection is closed.
        proxy_send_timeout 7d;
        proxy_read_timeout 7d;
    }
}

# 安装 Prometheus 的 Lua 支持库 nginx-lua-prometheus
/usr/local/openresty/bin/opm get knyar/nginx-lua-prometheus

Share

openresty配置prometheus监控