一、實現方案
1、每個微服務通過/metrics接口暴露自己的指標,作為一個exporter
2、部署prometheus,同時配置serviceMonitor采集微服務的exporter地址,exporter地址通過service地址方式訪問
serviceMonitor如下:
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: test-monitor
namespace: monitoring
spec:
endpoints:
- honorLabels: true
interval: 10s
port: http
scheme: http
namespaceSelector:
matchNames:
- test
selector:
matchLabels:
app.kubernetes.io/group: test
|
3、監控平臺配置grafana展示監控指標,配置告警規則
二、微服務監控指標列表
| 指標組 | 指標名稱 | 指標類型 | 指標說明 |
|---|---|---|---|
| http指標 |
http_request_duration_seconds_count |
Histogram |
http接口的請求時長 |
|
http_request_total |
Counter | http接口的請求總次數 | |
|
http_response_size_bytes |
Histogram |
http接口的響應體大小 | |
|
http_requests_in_flight |
Gauge |
http接口的并發請求數 | |
| Go指標
|
go_gc_duration_seconds_sum |
Summary | Go程序gc時長 |
go_goroutines |
Gauge |
go程序協程數量 | |
go_memstats_alloc_bytes |
Gauge |
Go程序占用的內存大小 | |
| Grpc指標 |
grpc_server_handled_total |
Counter | 每個grpc方法調用完成的請求數 |
grpc_server_handling_seconds_bucket |
Histogram |
每個grpc方法響應時長 | |
grpc_server_started_total |
Counter | 每個grpc方法調用的總次數 |
完整指標樣例:
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 2.5934e-05
go_gc_duration_seconds{quantile="0.25"} 3.062e-05
go_gc_duration_seconds{quantile="0.5"} 4.3478e-05
go_gc_duration_seconds{quantile="0.75"} 5.73e-05
go_gc_duration_seconds{quantile="1"} 0.000188399
go_gc_duration_seconds_sum 0.000660942
go_gc_duration_seconds_count 12
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 529
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.13.12"} 1
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes 2.364408e+07
# HELP http_request_duration_seconds The latency of the HTTP requests.
# TYPE http_request_duration_seconds histogram
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="1"} 5
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="5"} 5
http_request_duration_seconds_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="10"} 5
http_request_duration_seconds_sum{code="200",api="/v1/common/user/filter",method="GET",service="common"} 1.534460968
http_request_duration_seconds_count{code="200",api="/v1/common/user/filter",method="GET",service="common"} 5
# HELP http_requests_in_flight The number of inflight requests being handled at the same time.
# TYPE http_requests_in_flight gauge
http_requests_in_flight{api="/v1/common/user/filter",service="common"} 2
# HELP http_response_size_bytes The size of the HTTP responses.
# TYPE http_response_size_bytes histogram
http_response_size_bytes_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="100"} 0
http_response_size_bytes_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="1000"} 0
http_response_size_bytes_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="10000"} 3
http_response_size_bytes_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="100000"} 5
http_response_size_bytes_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="1e+06"} 5
http_response_size_bytes_bucket{code="200",api="/v1/common/user/filter",method="GET",service="common",le="+Inf"} 5
http_response_size_bytes_sum{code="200",api="/v1/common/user/filter",method="GET",service="common"} 44565
http_response_size_bytes_count{code="200",api="/v1/common/user/filter",method="GET",service="common"} 5
# HELP grpc_server_handled_total Total number of RPCs completed on the server, regardless of success or failure.
# TYPE grpc_server_handled_total counter
grpc_server_handled_total{grpc_code="OK",grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary"} 0
grpc_server_handled_total{grpc_code="OK",grpc_method="Bound",grpc_service="ecf.common.ssh.SSH",grpc_type="unary"} 0
grpc_server_handled_total{grpc_code="OK",grpc_method="Check",grpc_service="ecf.common.arrears.Arrears",grpc_type="unary"} 0
grpc_server_handled_total{grpc_code="OK",grpc_method="CheckEmailVerifyCode",grpc_service="ecf.common.message.Message",grpc_type="unary"} 0
grpc_server_handled_total{grpc_code="OK",grpc_method="CheckOfferLoad",grpc_service="ecf.common.offer.CommonOffer",grpc_type="unary"} 0
# HELP grpc_server_handling_seconds Histogram of response latency (seconds) of gRPC that had been application-level handled by the server.
# TYPE grpc_server_handling_seconds histogram
grpc_server_handling_seconds_bucket{grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary",le="0.005"} 0
grpc_server_handling_seconds_bucket{grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary",le="0.01"} 0
grpc_server_handling_seconds_bucket{grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary",le="0.025"} 0
# HELP grpc_server_started_total Total number of RPCs started on the server.
# TYPE grpc_server_started_total counter
grpc_server_started_total{grpc_method="AddQuotaUsers",grpc_service="ecf.common.quota.Quota",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="Bound",grpc_service="ecf.common.ssh.SSH",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="Check",grpc_service="ecf.common.arrears.Arrears",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="CheckEmailVerifyCode",grpc_service="ecf.common.message.Message",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="CheckOfferLoad",grpc_service="ecf.common.offer.CommonOffer",grpc_type="unary"} 0
grpc_server_started_total{grpc_method="Count",grpc_service="ecf.common.cluster.Cluster",grpc_type="unary"} 0
|
三、監控平臺配置指標
| 指標組 | 指標名稱 | 指標說明 |
|---|---|---|
| http指標 |
rate(http_request_duration_seconds_count{code!="200"}[30s]) |
http接口非200請求的每秒錯誤速率 |
|
sum(rate(http_request_total{}[5m])) by (service) |
最近5分鐘內每秒請求速率 | |
|
sum(rate(http_request_duration_seconds_sum[1m])) by (service) / sum(rate(http_request_duration_seconds_count[1m])) by (service) > 0.2 |
接口平均時間大于200ms | |
|
sum(http_requests_in_flight) by (service) |
http接口的并發請求數 | |
| Go指標
|
go_gc_duration_seconds_sum |
Go程序gc時長 |
go_goroutines |
go程序協程數量 | |
go_memstats_alloc_bytes |
Go程序占用的內存大小 | |
| Grpc指標 |
sum(rate(grpc_server_handled_total{grpc_type="unary",grpc_code!="OK"}[1m])) by (grpc_service) |
grpc請求平均錯誤率 |
|
histogram_quantile(0.9, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[5m])) by (grpc_service,le) ) |
每個服務的rpc處理時間的90%-tile分位數估計值 | |
|
sum(rate(grpc_server_handling_seconds_sum{grpc_type="unary"}[1m])) by (grpc_service) / sum(rate(grpc_server_handling_seconds_count{grpc_type="unary"}[1m])) by (grpc_service) > 0.2 |
rpc接口的1分鐘內的平均響應時間大于200ms的 |
|
grpc_server_started_total |
每個grpc方法調用的總次數 |
四、采集指標代碼實現
Grpc server實現grpc指標:
// grpc/server.go
import grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
// RunServer runs gRPC service
// NOTE(review): abridged snippet — `opts` is assigned without a visible
// declaration, and the net.Listen/Serve and return steps are elided.
func RunServer(ctx context.Context, port string, init InitServers) error {
// add grpc middleware (logging + Prometheus interceptors, see AddLogging)
opts = grpc_interceptor.AddLogging(log.WithField("server", "grpc"), opts)
// register service
server := grpc.NewServer(opts...)
// register api
init(server)
// initialize the standard gRPC server metrics for every registered service
grpc_prometheus.Register(server)
}
// grpc-interceptor/logger.go
// AddLogging returns grpc.Server config option that turn on logging.
// AddLogging returns grpc.Server config options that turn on request logging
// and Prometheus metrics for both unary and stream RPCs.
//
// Fix: the grpc_logrus options slice `o` was previously built but never used,
// and no logging interceptor was installed despite the function's name.
func AddLogging(logger *logrus.Entry, opts []grpc.ServerOption) []grpc.ServerOption {
	// Shared options for the logger, with a custom gRPC code to log level function.
	o := []grpc_logrus.Option{
		grpc_logrus.WithLevels(codeToLevel),
	}
	// Also record per-method handling-time histograms
	// (grpc_server_handling_seconds) in addition to the default counters.
	grpc_prometheus.EnableHandlingTimeHistogram()
	// Add unary interceptors: logging first, then Prometheus metrics.
	opts = append(opts, grpc_middleware.WithUnaryServerChain(
		grpc_logrus.UnaryServerInterceptor(logger, o...),
		grpc_prometheus.UnaryServerInterceptor,
	))
	// Add the matching stream interceptors.
	opts = append(opts, grpc_middleware.WithStreamServerChain(
		grpc_logrus.StreamServerInterceptor(logger, o...),
		grpc_prometheus.StreamServerInterceptor,
	))
	return opts
}
|
Grpc-gateway實現http指標:
// rest/server.go
// RunServer runs the HTTP/REST gateway with request-ID, logging and
// Prometheus middleware, and serves /metrics via AddPrometheus.
//
// Fix: restores the identifiers garbled by a Handler→api replacement
// (std.Handler, http.Server.Handler).
func RunServer(ctx context.Context, httpPort string, init InitMux, interceptors ...rest_interceptor.Interceptor) error {
	// more code...
	// NOTE(review): `api` comes from the elided code above — presumably the
	// grpc-gateway mux; verify against the full source.
	h := prometheus_interceptor.AddPrometheus(rest_interceptor.AddRequestID(
		rest_interceptor.AddLogger(log.WithField("gateway", "rest"), api)))
	// Create our middleware recorder backed by Prometheus.
	recorder := metrics.NewRecorder(metrics.Config{
		DurationBuckets: []float64{1, 2.5, 5, 10, 20, 40, 80, 160, 320, 640},
	})
	mdlw := middleware.New(middleware.Config{
		Recorder: recorder,
		Service:  "common",
	})
	// Wrap the handler chain with the HTTP metrics middleware.
	h = std.Handler("", mdlw, h)
	srv := &http.Server{
		Addr:         httpPort,
		ReadTimeout:  60 * time.Second,
		WriteTimeout: 60 * time.Second,
		// handler chain with all middleware applied
		Handler: h,
	}
	return srv.ListenAndServe()
}
// prometheus_interceptor/prometheus.go
func AddPrometheus(h http.api) http.api {
return http.apiFunc(func(w http.ResponseWriter, r *http.Request) {
if strings.HasPrefix(r.URL.Path, "/metrics") {
promhttp.api().ServeHTTP(w, r)
return
}
h.ServeHTTP(w, r)
return
})
}
// Reference: "github.com/slok/go-http-metrics/middleware/std"
// recorder implements metrics.Recorder on top of Prometheus collectors:
// a latency histogram, a response-size histogram and an in-flight gauge.
type recorder struct {
httpRequestDurHistogram *prometheus.HistogramVec
httpResponseSizeHistogram *prometheus.HistogramVec
httpRequestsInflight *prometheus.GaugeVec
}
// NewRecorder returns a new metrics recorder that implements the recorder
// using Prometheus as the backend. The collectors are registered on
// cfg.Registry; MustRegister panics on duplicate registration.
//
// Fix: restores cfg.HandlerIDLabel (the go-http-metrics Config field), which
// had been garbled to cfg.apiIDLabel.
func NewRecorder(cfg Config) metrics.Recorder {
	cfg.defaults()
	r := &recorder{
		httpRequestDurHistogram: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: cfg.Prefix,
			Subsystem: "http",
			Name:      "request_duration_seconds",
			Help:      "The latency of the HTTP requests.",
			Buckets:   cfg.DurationBuckets,
		}, []string{cfg.ServiceLabel, cfg.HandlerIDLabel, cfg.MethodLabel, cfg.StatusCodeLabel}),
		httpResponseSizeHistogram: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: cfg.Prefix,
			Subsystem: "http",
			Name:      "response_size_bytes",
			Help:      "The size of the HTTP responses.",
			Buckets:   cfg.SizeBuckets,
		}, []string{cfg.ServiceLabel, cfg.HandlerIDLabel, cfg.MethodLabel, cfg.StatusCodeLabel}),
		httpRequestsInflight: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Namespace: cfg.Prefix,
			Subsystem: "http",
			Name:      "requests_in_flight",
			Help:      "The number of inflight requests being handled at the same time.",
		}, []string{cfg.ServiceLabel, cfg.HandlerIDLabel}),
	}
	cfg.Registry.MustRegister(
		r.httpRequestDurHistogram,
		r.httpResponseSizeHistogram,
		r.httpRequestsInflight,
	)
	return r
}
// ObserveHTTPRequestDuration records one request's latency (in seconds) in
// the duration histogram, labelled by service, handler ID, method and status code.
func (r recorder) ObserveHTTPRequestDuration(_ context.Context, p metrics.HTTPReqProperties, duration time.Duration) {
r.httpRequestDurHistogram.WithLabelValues(p.Service, p.ID, p.Method, p.Code).Observe(duration.Seconds())
}
// ObserveHTTPResponseSize records one response's body size (in bytes) in the
// size histogram, labelled by service, handler ID, method and status code.
func (r recorder) ObserveHTTPResponseSize(_ context.Context, p metrics.HTTPReqProperties, sizeBytes int64) {
r.httpResponseSizeHistogram.WithLabelValues(p.Service, p.ID, p.Method, p.Code).Observe(float64(sizeBytes))
}
// AddInflightRequests adjusts the in-flight gauge by quantity.
// NOTE(review): presumably called with +1 before and -1 after each request by
// the middleware — verify against the go-http-metrics caller.
func (r recorder) AddInflightRequests(_ context.Context, p metrics.HTTPProperties, quantity int) {
r.httpRequestsInflight.WithLabelValues(p.Service, p.ID).Add(float64(quantity))
}
|