如何通过OpenTelemetry实现监控与可观测性的深度集成?

摘要:《监控与可观测性:OpenTelemetry集成》前言:在前几篇文章中,我们学习了如何开发自定义工具和中间件,构建了一个功能完善的Agent系统。然而,当系统部署到生产环境后,一个至关重要的问题浮现出来:如何监控系统的运行状态?如何排查问题?如何优化性能?
监控与可观测性:OpenTelemetry集成 前言 在前几篇文章中,我们学习了如何开发自定义工具和中间件,构建了一个功能完善的Agent系统。然而,当系统部署到生产环境后,一个至关重要的问题浮现出来:如何监控系统的运行状态?如何排查问题?如何优化性能? 这就是可观测性(Observability)的用武之地。可观测性是指通过分析系统的外部输出来理解其内部状态的能力,主要包含三个支柱:指标(Metrics)、日志(Logs)和追踪(Traces)。 本文将深入探讨如何在Agent Framework中集成OpenTelemetry,实现全面的可观测性。我们将学习如何收集关键指标、如何进行分布式追踪、如何聚合和分析日志,确保生产环境的系统稳定可靠。 一、可观测性基础 1.1 三大支柱 指标(Metrics):数值型的度量数据,如请求延迟、吞吐量、错误率等。指标可以帮助我们了解系统的整体健康状况和性能趋势。 日志(Logs):系统运行过程中产生的文本记录,包括信息、警告、错误等。日志对于排查具体问题非常有用。 追踪(Traces):记录一个请求从开始到结束经过的所有服务和组件。追踪可以帮助我们理解请求的处理流程,定位性能瓶颈。 1.2 OpenTelemetry简介 OpenTelemetry(简称OTel)是一个开源的可观测性框架,提供了标准化的指标、日志和追踪收集方式。它不依赖于特定的供应商,可以将数据发送到多种后端系统。 在.NET生态中,OpenTelemetry通过NuGet包提供支持,可以与ASP.NET Core、HttpClient、Entity Framework Core等框架无缝集成。 二、指标收集 2.1 指标基础 首先,我们定义需要收集的核心指标: // AgentMetrics.cs public class AgentMetrics { private readonly Counter<long> _requestCounter; private readonly Counter<long> _requestErrorsCounter; private readonly Histogram<double> _requestDurationHistogram; private readonly Counter<long> _tokenUsageCounter; private readonly Counter<long> _toolCallsCounter; private readonly Histogram<double> _toolDurationHistogram; private readonly Gauge<double> _activeConversationsGauge; public AgentMetrics(IMeterFactory meterFactory) { var meter = meterFactory.Create("AgentFramework"); _requestCounter = meter.CreateCounter<long>( "agent.requests.total", description: "Agent请求总数"); _requestErrorsCounter = meter.CreateCounter<long>( "agent.requests.errors.total", description: "Agent请求错误总数"); _requestDurationHistogram = meter.CreateHistogram<double>( "agent.requests.duration_ms", unit: "ms", description: "Agent请求处理时长"); _tokenUsageCounter = meter.CreateCounter<long>( "agent.tokens.usage.total", description: "Token使用量"); _toolCallsCounter = meter.CreateCounter<long>( "agent.tool.calls.total", description: "工具调用次数"); _toolDurationHistogram = meter.CreateHistogram<double>( "agent.tool.calls.duration_ms", unit: "ms", description: "工具调用时长"); _activeConversationsGauge = meter.CreateGauge<double>( 
"agent.conversations.active", description: "活跃对话数"); } // 记录请求开始 public IDisposable BeginRequest(string agentName, string userId) { var tags = new TagList { { "agent.name", agentName }, { "user.id", userId } }; _requestCounter.Add(1, tags); return new RequestTimer(this, agentName, userId); } // 记录请求结束 public void EndRequest( string agentName, string userId, TimeSpan duration, bool success, string? errorType = null) { var tags = new TagList { { "agent.name", agentName }, { "user.id", userId }, { "success", success.ToString() } }; if (!string.IsNullOrEmpty(errorType)) { tags.Add("error.type", errorType); } _requestDurationHistogram.Record(duration.TotalMilliseconds, tags); if (!success) { _requestErrorsCounter.Add(1, tags); } } // 记录Token使用 public void RecordTokenUsage( string agentName, int promptTokens, int completionTokens, string model) { var tags = new TagList { { "agent.name", agentName }, { "model", model } }; _tokenUsageCounter.Add(promptTokens, tags.With("token.type", "prompt")); _tokenUsageCounter.Add(completionTokens, tags.With("token.type", "completion")); } // 记录工具调用 public IDisposable BeginToolCall(string toolName) { var tags = new TagList { { "tool.name", toolName } }; _toolCallsCounter.Add(1, tags); return new ToolTimer(this, toolName); } public void EndToolCall(string toolName, TimeSpan duration, bool success) { var tags = new TagList { { "tool.name", toolName }, { "success", success.ToString() } }; _toolDurationHistogram.Record(duration.TotalMilliseconds, tags); } // 更新活跃对话数 public void SetActiveConversations(int count) { _activeConversationsGauge.Set(count); } private class RequestTimer : IDisposable { private readonly AgentMetrics _metrics; private readonly string _agentName; private readonly string _userId; private readonly Stopwatch _stopwatch; public RequestTimer(AgentMetrics metrics, string agentName, string userId) { _metrics = metrics; _agentName = agentName; _userId = userId; _stopwatch = Stopwatch.StartNew(); } public void Dispose() { 
_stopwatch.Stop(); _metrics.EndRequest(_agentName, _userId, _stopwatch.Elapsed, true); } } private class ToolTimer : IDisposable { private readonly AgentMetrics _metrics; private readonly string _toolName; private readonly Stopwatch _stopwatch; public ToolTimer(AgentMetrics metrics, string toolName) { _metrics = metrics; _toolName = toolName; _stopwatch = Stopwatch.StartNew(); } public void Dispose() { _stopwatch.Stop(); _metrics.EndToolCall(_toolName, _stopwatch.Elapsed, true); } } } 2.2 自定义指标 根据业务需求,我们可以添加更多自定义指标: // BusinessMetrics.cs public class BusinessMetrics { private readonly Counter<long> _conversationsStartedCounter; private readonly Counter<long> _conversationsCompletedCounter; private readonly Histogram<double> _conversationDurationHistogram; private readonly Counter<long> _intentsDetectedCounter; private readonly Counter<long> _toolSelectionCounter; private readonly Counter<long> _fallbackCounter; private readonly Histogram<double> _llmLatencyHistogram; private readonly Gauge<double> _queueDepthGauge; public BusinessMetrics(IMeterFactory meterFactory) { var meter = meterFactory.Create("AgentFramework.Business"); _conversationsStartedCounter = meter.CreateCounter<long>( "business.conversations.started", description: "新开始的对话数"); _conversationsCompletedCounter = meter.CreateCounter<long>( "business.conversations.completed", description: "完成的对话数"); _conversationDurationHistogram = meter.CreateHistogram<double>( "business.conversations.duration_ms", unit: "ms", description: "对话平均时长"); _intentsDetectedCounter = meter.CreateCounter<long>( "business.intents.detected", description: "检测到的意图数"); _toolSelectionCounter = meter.CreateCounter<long>( "business.tools.selected", description: "工具选择次数"); _fallbackCounter = meter.CreateCounter<long>( "business.fallback.occurred", description: "降级处理次数"); _llmLatencyHistogram = meter.CreateHistogram<double>( "business.llm.latency_ms", unit: "ms", description: "LLM调用延迟"); _queueDepthGauge = meter.CreateGauge<double>( 
"business.queue.depth", description: "消息队列深度"); } public void RecordConversationStarted(string channel) { var tags = new TagList { { "channel", channel } }; _conversationsStartedCounter.Add(1, tags); } public void RecordConversationCompleted(string channel, bool success) { var tags = new TagList { { "channel", channel }, { "success", success.ToString() } }; _conversationsCompletedCounter.Add(1, tags); } public void RecordConversationDuration(string channel, TimeSpan duration) { var tags = new TagList { { "channel", channel } }; _conversationDurationHistogram.Record(duration.TotalMilliseconds, tags); } public void RecordIntentDetected(string intent) { var tags = new TagList { { "intent", intent } }; _intentsDetectedCounter.Add(1, tags); } public void RecordToolSelected(string toolName) { var tags = new TagList { { "tool", toolName } }; _toolSelectionCounter.Add(1, tags); } public void RecordFallback(string reason) { var tags = new TagList { { "reason", reason } }; _fallbackCounter.Add(1, tags); } public void RecordLLMLatency(string model, TimeSpan latency) { var tags = new TagList { { "model", model } }; _llmLatencyHistogram.Record(latency.TotalMilliseconds, tags); } public void SetQueueDepth(int depth) { _queueDepthGauge.Set(depth); } } 三、分布式追踪 3.1 追踪基础 OpenTelemetry的追踪功能可以记录请求的完整调用链: // AgentTracing.cs public class AgentTracing { private readonly ITracer _tracer; private readonly ILogger<AgentTracing> _logger; public AgentTracing(ITracerProvider tracerProvider, ILogger<AgentTracing> logger) { _tracer = tracerProvider.GetTracer("AgentFramework"); _logger = logger; } // 创建Agent处理活动 public Activity? 
StartAgentActivity(string agentName, string conversationId, string userId) { var activity = _tracer.StartActivity( $"Agent.{agentName}.Process", ActivityKind.Server, new ActivityContext( ActivityTraceId.CreateFromString(Guid.NewGuid().ToString("N").AsSpan()), ActivitySpanId.CreateFromString(Guid.NewGuid().ToString("N").AsSpan()), ActivityTraceFlags.None, null)); activity?.SetTag("agent.name", agentName); activity?.SetTag("conversation.id", conversationId); activity?.SetTag("user.id", userId); return activity; } // 创建LLM调用活动 public Activity? StartLLMCallActivity(string model, int estimatedTokens) { var activity = _tracer.StartActivity( "LLM.Call", ActivityKind.Client); activity?.SetTag("llm.model", model); activity?.SetTag("llm.estimated_tokens", estimatedTokens); return activity; } // 创建工具调用活动 public Activity? StartToolActivity(string toolName, Dictionary<string, object>? parameters) { var activity = _tracer.StartActivity( $"Tool.{toolName}", ActivityKind.Internal); activity?.SetTag("tool.name", toolName); if (parameters != null) { foreach (var kvp in parameters.Take(10)) // 限制参数数量 { activity?.SetTag($"tool.param.{kvp.Key}", kvp.Value?.ToString()); } } return activity; } // 记录LLM响应 public void RecordLLMResponse( Activity activity, string response, int promptTokens, int completionTokens, TimeSpan latency) { activity.SetTag("llm.response.length", response.Length); activity.SetTag("llm.prompt_tokens", promptTokens); activity.SetTag("llm.completion_tokens", completionTokens); activity.SetTag("llm.total_tokens", promptTokens + completionTokens); activity.SetTag("llm.latency_ms", latency.TotalMilliseconds); // 记录延迟 activity.AddEvent(new ActivityEvent("LLM Response Received", DateTimeOffset.UtcNow, new ActivityTagsCollection { { "tokens", promptTokens + completionTokens }, { "latency_ms", latency.TotalMilliseconds } })); } // 记录错误 public void RecordError(Activity activity, Exception exception) { activity.SetTag("error", true); activity.SetTag("error.type", 
exception.GetType().Name); activity.SetTag("error.message", exception.Message); activity.AddEvent(new ActivityEvent("Error", DateTimeOffset.UtcNow, new ActivityTagsCollection { { "exception.type", exception.GetType().Name }, { "exception.message", exception.Message } })); _logger.LogError(exception, "追踪记录错误"); } } 3.2 追踪上下文传播 在分布式系统中,需要在不同服务之间传播追踪上下文: // TracingContextPropagator.cs public class TracingContextPropagator { private readonly ITextMapPropagator _propagator; public TracingContextPropagator() { // 使用W3C追踪上下文传播标准 _propagator = new CompositeTextMapPropagator( new TraceContextPropagator(), new BaggagePropagator()); } // 注入上下文到HTTP请求 public void InjectIntoRequest(HttpRequestMessage request, Activity? activity) { if (activity == null) return; var carrier = new HttpRequestMessageAdapter(request); _propagator.Inject( new PropagationContext(activity.Context, Baggage.Current), carrier, (carrier, key, value) => carrier.Headers.TryAddWithoutValidation(key, value)); } // 从HTTP请求中提取上下文 public PropagationContext ExtractFromRequest(HttpRequestMessage request) { var carrier = new HttpRequestMessageAdapter(request); var context = _propagator.Extract(carrier, (carrier, key) => { if (carrier.Headers.TryGetValues(key, out var values)) { return values.ToArray(); } return Array.Empty<string>(); }); return context; } // 传播 baggage(业务上下文) public void AddBaggage(string key, string value) { Baggage.Current = Baggage.Current.SetBaggage(key, value); } public string? 
GetBaggage(string key) { return Baggage.Current.GetBaggage(key); } private class HttpRequestMessageAdapter { private readonly HttpRequestMessage _request; public HttpRequestMessageAdapter(HttpRequestMessage request) { _request = request; } public HttpHeaders Headers => _request.Headers; } } 四、日志集成 4.1 结构化日志 使用.NET的日志框架结合OpenTelemetry: // AgentLogger.cs public class AgentLogger { private readonly ILogger<AgentLogger> _logger; private readonly ITraceContext _traceContext; public AgentLogger( ILogger<AgentLogger> logger, ITraceContext traceContext) { _logger = logger; _traceContext = traceContext; } public void LogAgentRequest(string agentName, string userId, string message) { _logger.LogInformation( "Agent请求: Agent={AgentName}, UserId={UserId}, Message={Message}", agentName, userId, message); } public void LogLLMCall(string model, int promptTokens, TimeSpan latency) { _logger.LogInformation( "LLM调用: Model={Model}, PromptTokens={PromptTokens}, Latency={Latency}ms", model, promptTokens, latency.TotalMilliseconds); } public void LogToolCall(string toolName, Dictionary<string, object> parameters, TimeSpan duration) { _logger.LogInformation( "工具调用: Tool={ToolName}, Duration={Duration}ms, Params={Params}", toolName, duration.TotalMilliseconds, JsonSerializer.Serialize(parameters)); } public void LogError(string operation, Exception exception) { _logger.LogError(exception, "错误: Operation={Operation}", operation); } public void LogWarning(string message, Dictionary<string, object>? 
context = null) { if (context != null) { _logger.LogWarning("{Message} {@Context}", message, context); } else { _logger.LogWarning("{Message}", message); } } } 4.2 日志筛选和关联 // LoggingConfiguration.cs public static class LoggingConfiguration { public static WebApplicationBuilder ConfigureLogging( this WebApplicationBuilder builder, IConfiguration configuration) { // 添加OpenTelemetry日志导出 builder.Services.AddOpenTelemetryLogExporter(); // 配置日志级别 builder.Logging.Configure(options => { options.ActivityTrackingOptions = ActivityTrackingOptions.TraceId | ActivityTrackingOptions.SpanId | ActivityTrackingOptions.Category; }); // 添加控制台输出(开发环境) if (builder.Environment.IsDevelopment()) { builder.Logging.AddConsole(options => { options.FormatterName = "json"; }); } return builder; } private static IServiceCollection AddOpenTelemetryLogExporter(this IServiceCollection services) { services.AddSingleton<OpenTelemetryLoggerProvider>(sp => { var resourceBuilder = ResourceBuilder.CreateDefault() .AddService("AgentService") .AddAttributes(new Dictionary<string, object> { { "service.version", "1.0.0" }, { "deployment.environment", Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT") ?? "production" } }); var provider = new LoggerProviderBuilder() .SetResourceBuilder(resourceBuilder) .AddProcessor(new BatchLogRecordProcessor( new OtlpExporter(new OtlpExporterOptions()), new BatchLogRecordProcessorOptions())) .Build(); return provider; }); return services; } } 五、完整集成 5.1 可观测性服务注册 将所有可观测性组件整合到一起: // ObservabilityExtensions.cs public static class ObservabilityExtensions { public static IServiceCollection AddAgentObservability( this IServiceCollection services, IConfiguration configuration) { // 1. 添加OpenTelemetry services.AddOpenTelemetry() .ConfigureResource(resource => resource .AddService("AgentService") .AddAttributes(new Dictionary<string, object> { ["service.version"] = configuration["App:Version"] ?? "1.0.0", ["deployment.environment"] = configuration["Environment"] ?? 
"production" })) // 指标 .WithMetrics(metrics => { metrics .AddMeter("AgentFramework") .AddMeter("AgentFramework.Business") .AddRuntimeInstrumentation() .AddProcessInstrumentation() .AddHttpClientInstrumentation() .AddAspNetCoreInstrumentation(); }) // 追踪 .WithTracing(tracing => { tracing .AddSource("AgentFramework") .AddHttpClientInstrumentation() .AddAspNetCoreInstrumentation() .AddEntityFrameworkCoreInstrumentation() .SetSampler(new AlwaysOnSampler()); }); // 2. 注册指标和追踪服务 services.AddSingleton<AgentMetrics>(); services.AddSingleton<BusinessMetrics>(); services.AddSingleton<AgentTracing>(); services.AddSingleton<TracingContextPropagator>(); // 3. 添加OTLP导出器(发送到后端) services.AddGrpcOtlpExporter(); return services; } private static IServiceCollection AddGrpcOtlpExporter(this IServiceCollection services) { services.AddSingleton<OtlpExporterOptions>(sp => { var configuration = sp.GetRequiredService<IConfiguration>(); return new OtlpExporterOptions { Endpoint = new Uri(configuration["OpenTelemetry:Endpoint"] ?? 
"http://localhost:4317"), Protocol = OtlpProtocol.Grpc }; }); services.AddSingleton<MetricReaderOptions>(sp => { var configuration = sp.GetRequiredService<IConfiguration>(); return new MetricReaderOptions { PeriodicExportingMetricReaderOptions = new PeriodicExportingMetricReaderOptions { ExportIntervalMilliseconds = 10000 } }; }); return services; } } 5.2 Agent中间件集成 创建一个集成可观测性的Agent中间件: // ObservabilityMiddleware.cs public class ObservabilityMiddleware : IAgentMiddleware { private readonly AgentMetrics _metrics; private readonly AgentTracing _tracing; private readonly AgentLogger _logger; private readonly ILogger<ObservabilityMiddleware> _loggerFactory; public ObservabilityMiddleware( AgentMetrics metrics, AgentTracing tracing, ILogger<ObservabilityMiddleware> loggerFactory) { _metrics = metrics; _tracing = tracing; _loggerFactory = loggerFactory; _logger = new AgentLogger(loggerFactory, tracing); } public async Task InvokeAsync(AgentContext context, AgentDelegate next) { var agentName = context.Properties.GetValueOrDefault("agentName")?.ToString() ?? 
"default"; var activity = _tracing.StartAgentActivity(agentName, context.ConversationId, context.UserId); // 记录开始时间 var stopwatch = Stopwatch.StartNew(); try { // 记录请求开始 using var requestTimer = _metrics.BeginRequest(agentName, context.UserId); _logger.LogAgentRequest(agentName, context.UserId, context.Message); // 调用下一个中间件 await next(context); // 记录请求成功 _metrics.EndRequest(agentName, context.UserId, stopwatch.Elapsed, true); } catch (Exception ex) { // 记录错误 _metrics.EndRequest(agentName, context.UserId, stopwatch.Elapsed, false, ex.GetType().Name); if (activity != null) { _tracing.RecordError(activity, ex); } _logger.LogError("Agent处理", ex); throw; } finally { stopwatch.Stop(); if (activity != null) { activity.SetTag("duration_ms", stopwatch.Elapsed.TotalMilliseconds); activity.SetTag("success", context.Properties.GetValueOrDefault("Success", true).ToString()); activity.Dispose(); } } } } 5.3 工具追踪 为工具调用添加追踪: // ToolObservabilityExtensions.cs public static class ToolObservabilityExtensions { public static async Task<ToolExecutionResult> ExecuteWithTracing( this ITool tool, Dictionary<string, object> parameters, AgentTracing tracing, AgentMetrics metrics) { var activity = tracing.StartToolActivity(tool.Name, parameters); using var timer = metrics.BeginToolCall(tool.Name); try { var result = await tool.ExecuteAsync(parameters); // 记录成功 metrics.EndToolCall(tool.Name, TimeSpan.Zero, result.Success); if (activity != null) { activity.SetTag("success", result.Success.ToString()); if (!result.Success) { activity.SetTag("error", result.Error ?? 
"Unknown error"); } } return result; } catch (Exception ex) { metrics.EndToolCall(tool.Name, TimeSpan.Zero, false); if (activity != null) { tracing.RecordError(activity, ex); } throw; } finally { activity?.Dispose(); } } } 六、可视化和告警 6.1 关键仪表盘指标 定义需要监控的关键指标: // DashboardMetrics.cs public class DashboardMetrics { // 核心业务指标 // 请求量:agent.requests.total // 错误率:计算 agent.requests.errors.total / agent.requests.total // 平均延迟:agent.requests.duration_ms 平均值 // P99延迟:agent.requests.duration_ms P99 // 活跃对话:agent.conversations.active // Token使用量:agent.tokens.usage.total // LLM指标 // 模型调用延迟:business.llm.latency_ms // Token消耗:按模型分组 // 错误分布:按错误类型分组 // 工具指标 // 工具调用次数:agent.tool.calls.total // 工具延迟:agent.tool.calls.duration_ms // 热门工具:按工具名称分组 // 业务指标 // 对话完成率:business.conversations.completed / business.conversations.started // 意图识别:按意图分组 // 降级次数:business.fallback.occurred } 6.2 告警规则 // AlertRules.cs public class AlertRules { // 告警规则定义 // 1. 错误率告警 // 条件:错误率 > 5% 持续 5分钟 // 严重性:高 // 通知:钉钉/邮件/Slack // 2. 延迟告警 // 条件:P99延迟 > 10秒 持续 5分钟 // 严重性:中 // 通知:钉钉 // 3. Token限额告警 // 条件:Token使用量 > 80% 限额 // 严重性:高 // 通知:邮件 // 4. 对话失败告警 // 条件:连续失败 > 10次 // 严重性:高 // 通知:电话 // 5. 
工具超时告警 // 条件:工具调用超时 > 30秒 // 严重性:中 // 通知:钉钉 public static List<AlertDefinition> GetDefaultAlerts() { return new List<AlertDefinition> { new AlertDefinition { Name = "high_error_rate", DisplayName = "错误率过高", Condition = "agent.requests.errors.total / agent.requests.total > 0.05", Duration = Duration.FromMinutes(5), Severity = AlertSeverity.High, Channels = new List<string> { "dingtalk", "email" } }, new AlertDefinition { Name = "high_latency", DisplayName = "延迟过高", Condition = "histogram_quantile(0.99, agent.requests.duration_ms) > 10000", Duration = Duration.FromMinutes(5), Severity = AlertSeverity.Medium, Channels = new List<string> { "dingtalk" } }, new AlertDefinition { Name = "token_quota_warning", DisplayName = "Token配额警告", Condition = "agent.tokens.usage.total > 0.8 * quota", Duration = Duration.FromMinutes(1), Severity = AlertSeverity.High, Channels = new List<string> { "email" } } }; } } public class AlertDefinition { public string Name { get; set; } = string.Empty; public string DisplayName { get; set; } = string.Empty; public string Condition { get; set; } = string.Empty; public Duration Duration { get; set; } public AlertSeverity Severity { get; set; } public List<string> Channels { get; set; } = new(); } public enum AlertSeverity { Low, Medium, High, Critical } 七、最佳实践 7.1 指标命名规范 遵循OpenTelemetry的命名规范: 命名空间:使用点号分隔的层级结构,如 agent.requests、llm.calls 单位:始终包含单位,如 duration_ms、count 前缀:使用通用的前缀,如 total(计数器)、current(仪表) 标签:使用小写字母和下划线,如 user_id、conversation_id 7.2 采样策略 对于高流量系统,可以采用采样策略减少数据量: // SamplingStrategy.cs public class SamplingStrategy { // 1. 始终采样:错误请求 // 2. 概率采样:正常请求(如10%) // 3. 
尾部采样:慢请求(如 > 5秒) public static Sampler CreateSampler(IConfiguration configuration) { var sampleRate = configuration.GetValue<double>("OpenTelemetry:SampleRate", 0.1); return new CompositeSampler( // 错误请求始终采样 new ErrorSampler(), // 尾部采样(慢请求) new TailSampler(TimeSpan.FromSeconds(5)), // 概率采样 new ParentBasedSampler(new TraceIdRatioBasedSampler(sampleRate))); } } 7.3 性能考量 在生产环境中,需要注意: 批处理:使用批处理器导出数据,避免每次请求都发送 采样:在高流量场景下使用采样,减少数据量 资源限制:设置合理的资源限制,避免内存溢出 异步处理:所有导出操作应该异步进行,不阻塞主请求 八、总结与展望 通过本文的学习,我们已经掌握了Agent系统可观测性的核心技术: ✅ 指标收集:使用OpenTelemetry收集请求延迟、错误率、Token使用量等关键指标 ✅ 分布式追踪:记录请求的完整调用链,便于性能分析和问题排查 ✅ 日志集成:结构化日志与追踪上下文关联 ✅ 完整集成:将可观测性组件整合到Agent系统中 ✅ 告警规则:定义关键指标的告警规则 ✅ 最佳实践:遵循指标命名规范和采样策略 关键收获: 可观测性是生产环境系统不可或缺的能力。通过集成OpenTelemetry,我们可以全面了解Agent系统的运行状态,快速定位和解决问题,优化系统性能。在实际应用中,需要根据业务规模选择合适的采样策略,并建立完善的监控告警体系。 下一篇文章预告: 在第九篇文章中,我们将进入实战案例环节,构建一个企业客服智能助手。我们将综合运用前几篇文章的知识,从需求分析到系统设计,再到代码实现,完整地实现一个生产级的客服系统。 实践建议: 从一开始就集成可观测性,不要事后补救 关注关键指标:延迟、错误率、成本(Token使用) 建立完善的告警机制,及时发现和处理问题 定期回顾仪表盘数据,优化系统性能 保留足够的历史数据用于趋势分析 相关资源: OpenTelemetry .NET官方文档 OpenTelemetry指标规范 OpenTelemetry追踪规范 "可观测性不是选择,而是生产环境的必备。"