diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 0838d3270a022..e4451c20ebbd6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -319,6 +319,11 @@ datadog_checks_base/datadog_checks/base/checks/windows/ @DataDog/wi /incident_io/manifest.json @DataDog/saas-integrations @DataDog/documentation /incident_io/assets/logs/ @DataDog/saas-integrations @DataDog/documentation @DataDog/logs-integrations-reviewers +/kandji/ @DataDog/saas-integrations +/kandji/*.md @DataDog/saas-integrations @DataDog/documentation +/kandji/manifest.json @DataDog/saas-integrations @DataDog/documentation +/kandji/assets/logs/ @DataDog/saas-integrations @DataDog/documentation @DataDog/logs-backend + /keeper/ @DataDog/saas-integrations /keeper/*.md @DataDog/saas-integrations @DataDog/documentation /keeper/manifest.json @DataDog/saas-integrations @DataDog/documentation diff --git a/.github/workflows/config/labeler.yml b/.github/workflows/config/labeler.yml index 8945763a16487..1a07add94ffdd 100644 --- a/.github/workflows/config/labeler.yml +++ b/.github/workflows/config/labeler.yml @@ -361,6 +361,8 @@ integration/kafka_consumer: - kafka_consumer/**/* integration/karpenter: - karpenter/**/* +integration/kandji: +- kandji/**/* integration/keda: - keda/**/* integration/keeper: diff --git a/ddev/changelog.d/21774.fixed b/ddev/changelog.d/21774.fixed new file mode 100644 index 0000000000000..8b18697d70b3d --- /dev/null +++ b/ddev/changelog.d/21774.fixed @@ -0,0 +1 @@ +Fixes duplicate results when filtering specific artifacts in the `ddev size` command \ No newline at end of file diff --git a/ddev/src/ddev/cli/size/utils/common_funcs.py b/ddev/src/ddev/cli/size/utils/common_funcs.py index 37cd296162258..9e33a4562c1c3 100644 --- a/ddev/src/ddev/cli/size/utils/common_funcs.py +++ b/ddev/src/ddev/cli/size/utils/common_funcs.py @@ -995,7 +995,7 @@ def get_last_dependency_sizes_artifact( size of that commit. 
''' size_type = 'compressed' if compressed else 'uncompressed' - app.display(f"Retrieving dependency sizes for {commit} ({platform}, py{py_version}, {size_type})") + app.display(f"\nRetrieving dependency sizes for {commit} ({platform}, py{py_version}, {size_type})") dep_sizes_json = get_dep_sizes_json(app, commit, platform, py_version) if not dep_sizes_json: @@ -1033,6 +1033,11 @@ def get_dep_sizes_json(app: Application, current_commit: str, platform: str, py_ def get_run_id(app: Application, commit: str, workflow: str) -> str | None: app.display_debug(f"Fetching workflow run ID for {commit} ({os.path.basename(workflow)})") + if workflow == MEASURE_DISK_USAGE_WORKFLOW: + jq = f'.[] | select(.name == "Measure Disk Usage [{commit}]") | .databaseId' + else: + jq = '.[-1].databaseId' + result = subprocess.run( [ 'gh', @@ -1043,14 +1048,13 @@ def get_run_id(app: Application, commit: str, workflow: str) -> str | None: '-c', commit, '--json', - 'databaseId', + 'databaseId,name', '--jq', - '.[-1].databaseId', + jq, ], capture_output=True, text=True, ) - run_id = result.stdout.strip() if result.stdout else None if run_id: app.display_debug(f"Workflow run ID: {run_id}") @@ -1065,7 +1069,7 @@ def get_current_sizes_json(app: Application, run_id: str, platform: str, py_vers ''' Downloads the dependency sizes json for a given run id and platform when dependencies were resolved. 
''' - app.display(f"Retrieving dependency sizes artifact (run={run_id}, platform={platform})") + app.display(f"\nRetrieving dependency sizes artifact (run={run_id}, platform={platform})") with tempfile.TemporaryDirectory() as tmpdir: app.display_debug(f"Downloading artifacts to {tmpdir}...") try: diff --git a/hugging_face_tgi/assets/dashboards/hugging_face_tgi_overview.json b/hugging_face_tgi/assets/dashboards/hugging_face_tgi_overview.json new file mode 100644 index 0000000000000..aacd53f61a772 --- /dev/null +++ b/hugging_face_tgi/assets/dashboards/hugging_face_tgi_overview.json @@ -0,0 +1,1646 @@ +{ + "title": "Hugging Face TGI - Overview", + "description": "# About\nMonitor your Hugging Face Text Generation Inference deployment with real-time metrics for requests, tokens, batching, and system health.", + "widgets": [ + { + "id": 1, + "definition": { + "title": "About", + "background_color": "vivid_yellow", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 7638856905341489, + "definition": { + "type": "image", + "url": "/static/images/logos/hugging-face-tgi_large.svg", + "url_dark_theme": "/static/images/logos/hugging-face-tgi_reversed_large.svg", + "sizing": "contain", + "has_background": true, + "has_border": false, + "vertical_align": "center", + "horizontal_align": "center" + }, + "layout": { + "x": 0, + "y": 0, + "width": 5, + "height": 2 + } + }, + { + "id": 2, + "definition": { + "type": "note", + "content": "# Hugging Face TGI Monitoring\n\nTrack your Text Generation Inference server performance with key metrics for throughput, latency, token generation, and batch processing. 
Identify bottlenecks and optimize your deployment for better efficiency.", + "background_color": "white", + "font_size": "14", + "text_align": "left", + "vertical_align": "top", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 2, + "width": 3, + "height": 3 + } + }, + { + "id": 3, + "definition": { + "type": "note", + "content": "# Useful Links\n\n[Hugging Face TGI Integration](https://docs.datadoghq.com/integrations/hugging_face_tgi)\n\n[TGI Documentation](https://huggingface.co/docs/text-generation-inference)\n\n[Hugging Face Hub](https://huggingface.co/)", + "background_color": "white", + "font_size": "14", + "text_align": "left", + "vertical_align": "top", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 3, + "y": 2, + "width": 2, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 0, + "width": 5, + "height": 6 + } + }, + { + "id": 4, + "definition": { + "title": "System Overview", + "background_color": "vivid_yellow", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 5, + "definition": { + "type": "note", + "content": "Get an instant overview of your TGI deployment's operational status. 
These key performance indicators provide immediate visibility into service health, processing capacity, and potential bottlenecks that require attention.", + "background_color": "yellow", + "font_size": "16", + "text_align": "center", + "vertical_align": "center", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 7, + "height": 1 + } + }, + { + "id": 6, + "definition": { + "title": "Service Health", + "title_size": "16", + "title_align": "left", + "type": "check_status", + "check": "hugging_face_tgi.openmetrics.health", + "grouping": "cluster", + "group_by": ["host"], + "tags": ["$host"] + }, + "layout": { + "x": 0, + "y": 1, + "width": 2, + "height": 1 + } + }, + { + "id": 7, + "definition": { + "title": "Average Queue Size", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:hugging_face_tgi.queue.size{$host}", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "conditional_formats": [ + { + "comparator": ">", + "value": 50, + "palette": "white_on_red" + }, + { + "comparator": ">", + "value": 30, + "palette": "white_on_yellow" + }, + { + "comparator": "<=", + "value": 30, + "palette": "white_on_green" + } + ] + } + ], + "autoscale": true, + "precision": 0 + }, + "layout": { + "x": 2, + "y": 1, + "width": 2, + "height": 1 + } + }, + { + "id": 8, + "definition": { + "title": "Monitors", + "type": "manage_status", + "display_format": "countsAndList", + "color_preference": "text", + "hide_zero_counts": true, + "show_status": true, + "last_triggered_format": "relative", + "query": "tag:(integration:hugging-face-tgi)", + "sort": "status,asc", + "count": 50, + "start": 0, + "summary_type": "monitors", + "show_priority": false, + "show_last_triggered": false + }, + "layout": { + "x": 4, + "y": 1, + 
"width": 3, + "height": 4 + } + }, + { + "id": 9, + "definition": { + "title": "Requests per second", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "throughput(query1)" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:hugging_face_tgi.request.count{$host}.as_count()", + "aggregator": "sum" + } + ], + "response_format": "scalar" + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "yaxis": {}, + "type": "bars" + } + }, + "layout": { + "x": 0, + "y": 2, + "width": 2, + "height": 1 + } + }, + { + "id": 10, + "definition": { + "title": "Mean Time per Token (ms)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "millisecond" + } + }, + "formula": "query1 / query2 * 1000" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:hugging_face_tgi.request.mean_time_per_token.duration.sum{$host}.as_count()", + "aggregator": "avg" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:hugging_face_tgi.request.mean_time_per_token.duration.count{$host}.as_count()", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "conditional_formats": [ + { + "comparator": ">", + "value": 200, + "palette": "white_on_red" + }, + { + "comparator": ">", + "value": 100, + "palette": "white_on_yellow" + }, + { + "comparator": "<=", + "value": 100, + "palette": "white_on_green" + } + ] + } + ], + "autoscale": true, + "precision": 1 + }, + "layout": { + "x": 2, + "y": 2, + "width": 2, + "height": 1 + } + }, + { + "id": 4097212373930688, + "definition": { + "title": "Host performance summary", + "title_size": "16", + "title_align": "left", + "type": "query_table", + "requests": [ + { + "queries": [ + { + "data_source": "metrics", + "name": "query1", + 
"query": "sum:hugging_face_tgi.request.count{$host} by {host}.as_count()", + "aggregator": "sum" + }, + { + "data_source": "metrics", + "name": "query3", + "query": "avg:hugging_face_tgi.queue.size{$host} by {host}", + "aggregator": "avg" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:hugging_face_tgi.request.duration.sum{$host} by {host}.as_count()", + "aggregator": "sum" + }, + { + "data_source": "metrics", + "name": "query4", + "query": "sum:hugging_face_tgi.request.duration.count{$host} by {host}.as_count()", + "aggregator": "sum" + } + ], + "response_format": "scalar", + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + }, + "formulas": [ + { + "cell_display_mode": "bar", + "alias": "Requests per second", + "formula": "throughput(query1)" + }, + { + "cell_display_mode": "bar", + "alias": "Avg queue size", + "formula": "query3" + }, + { + "alias": "Avg req duration", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query2 / query4", + "cell_display_mode": "bar" + } + ] + } + ], + "has_search_bar": "auto" + }, + "layout": { + "x": 0, + "y": 3, + "width": 4, + "height": 2 + } + } + ] + }, + "layout": { + "x": 5, + "y": 0, + "width": 7, + "height": 6 + } + }, + { + "id": 11, + "definition": { + "title": "Request Performance", + "background_color": "vivid_yellow", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 12, + "definition": { + "type": "note", + "content": "Analyze how your TGI server handles incoming requests over time. 
These metrics reveal processing patterns, help identify peak usage periods, and show the complete request lifecycle from queuing through validation to final inference completion.", + "background_color": "yellow", + "font_size": "16", + "text_align": "center", + "vertical_align": "top", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 12, + "height": 1 + } + }, + { + "id": 13, + "definition": { + "title": "Request count", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": ["avg", "min", "max", "value", "sum"], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "alias": "Success", + "style": { + "palette": "green", + "palette_index": 4 + }, + "formula": "query2" + }, + { + "alias": "Errors", + "style": { + "palette": "warm", + "palette_index": 4 + }, + "formula": "query3" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query2", + "query": "sum:hugging_face_tgi.request.success.count{$host}.as_count()" + }, + { + "data_source": "metrics", + "name": "query3", + "query": "sum:hugging_face_tgi.request.failure.count{$host}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "bars" + } + ] + }, + "layout": { + "x": 0, + "y": 1, + "width": 6, + "height": 3 + } + }, + { + "id": 26, + "definition": { + "title": "Processing timeline per request ", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": ["avg", "min", "max", "value", "sum"], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "alias": "Queue Duration", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1 / query4" + }, + { + "alias": "Validation Duration", + "number_format": { + "unit": { + "type": 
"canonical_unit", + "unit_name": "second" + } + }, + "formula": "query2 / query5" + }, + { + "alias": "Inference Duration", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query3 / query6" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:hugging_face_tgi.request.queue.duration.sum{$host}.as_count()" + }, + { + "data_source": "metrics", + "name": "query4", + "query": "sum:hugging_face_tgi.request.queue.duration.count{$host}.as_count()" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:hugging_face_tgi.request.validation.duration.sum{$host}.as_count()" + }, + { + "data_source": "metrics", + "name": "query5", + "query": "sum:hugging_face_tgi.request.validation.duration.count{$host}.as_count()" + }, + { + "data_source": "metrics", + "name": "query3", + "query": "sum:hugging_face_tgi.request.inference.duration.sum{$host}.as_count()" + }, + { + "data_source": "metrics", + "name": "query6", + "query": "sum:hugging_face_tgi.request.inference.duration.count{$host}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "bars" + } + ] + }, + "layout": { + "x": 6, + "y": 1, + "width": 6, + "height": 3 + } + }, + { + "id": 25, + "definition": { + "title": "Queue depth over time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": ["avg", "min", "max", "value", "sum"], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "alias": "Queue Size", + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:hugging_face_tgi.queue.size{$host}" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + 
"y": 4, + "width": 12, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 6, + "width": 12, + "height": 8 + } + }, + { + "id": 15, + "definition": { + "title": "Token Generation Performance", + "background_color": "vivid_yellow", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 16, + "definition": { + "type": "note", + "content": "Understand your model's token generation characteristics and performance patterns. These metrics help track the model's input and output volumes.", + "background_color": "yellow", + "font_size": "16", + "text_align": "center", + "vertical_align": "top", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 12, + "height": 1 + } + }, + { + "id": 18, + "definition": { + "title": "Total Input", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": ["avg", "min", "max", "value", "sum"], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "alias": "Input Tokens", + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:hugging_face_tgi.request.input_length.sum{$host}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "orange", + "order_reverse": false, + "color_order": "monotonic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "bars" + } + ], + "markers": [] + }, + "layout": { + "x": 0, + "y": 1, + "width": 5, + "height": 4 + } + }, + { + "id": 17, + "definition": { + "title": "Total Generated", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": ["avg", "min", "max", "value", "sum"], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "alias": "Generated Tokens", + "style": { + "palette": "classic", + "palette_index": 4 + }, + "formula": 
"query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:hugging_face_tgi.request.generated_tokens.sum{$host}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "bars" + } + ] + }, + "layout": { + "x": 5, + "y": 1, + "width": 5, + "height": 4 + } + }, + { + "id": 2278476570708517, + "definition": { + "title": "Ingested", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:hugging_face_tgi.request.input_length.sum{$host}.as_count()", + "aggregator": "sum" + } + ], + "response_format": "scalar" + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "yaxis": {}, + "type": "bars" + } + }, + "layout": { + "x": 10, + "y": 1, + "width": 2, + "height": 2 + } + }, + { + "id": 1031719254760586, + "definition": { + "title": "Output", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:hugging_face_tgi.request.generated_tokens.sum{$host}.as_count()", + "aggregator": "sum" + } + ], + "response_format": "scalar" + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "type": "bars", + "yaxis": {} + } + }, + "layout": { + "x": 10, + "y": 3, + "width": 2, + "height": 2 + } + } + ] + }, + "layout": { + "x": 0, + "y": 14, + "width": 12, + "height": 6 + } + }, + { + "id": 19, + "definition": { + "title": "Batch Processing", + "background_color": "vivid_yellow", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 20, + "definition": { + "type": "note", + "content": "Explore how TGI optimizes 
inference through intelligent batching. These metrics reveal batch composition strategies, processing pipeline efficiency, and help identify opportunities to improve throughput by understanding the various stages of batch execution.", + "background_color": "yellow", + "font_size": "16", + "text_align": "center", + "vertical_align": "top", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 12, + "height": 1 + } + }, + { + "id": 3279775942383392, + "definition": { + "title": "Average prefill time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": ["avg", "min", "max", "value", "sum"], + "time": {}, + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "alias": "Concatenation", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1 / query2" + }, + { + "alias": "Filter", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query3 / query4" + }, + { + "alias": "Forward", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query5 / query6" + }, + { + "alias": "Inference - Time to First Token", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query7 / query8" + }, + { + "alias": "Decode", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query9 / query10" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:hugging_face_tgi.batch.concat.duration.sum{$host,method:prefill}.as_count()" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:hugging_face_tgi.batch.concat.duration.count{$host,method:prefill}.as_count()" + }, + { + "data_source": "metrics", + "name": 
"query3", + "query": "sum:hugging_face_tgi.batch.filter.duration.sum{$host,method:prefill}.as_count()" + }, + { + "data_source": "metrics", + "name": "query4", + "query": "sum:hugging_face_tgi.batch.filter.duration.count{$host,method:prefill}.as_count()" + }, + { + "data_source": "metrics", + "name": "query5", + "query": "sum:hugging_face_tgi.batch.forward.duration.sum{$host,method:prefill}.as_count()" + }, + { + "data_source": "metrics", + "name": "query6", + "query": "sum:hugging_face_tgi.batch.forward.duration.count{$host,method:prefill}.as_count()" + }, + { + "data_source": "metrics", + "name": "query7", + "query": "sum:hugging_face_tgi.batch.inference.duration.sum{$host,method:prefill}.as_count()" + }, + { + "data_source": "metrics", + "name": "query8", + "query": "sum:hugging_face_tgi.batch.inference.duration.count{$host,method:prefill}.as_count()" + }, + { + "data_source": "metrics", + "name": "query9", + "query": "sum:hugging_face_tgi.batch.decode.duration.sum{$host,method:prefill}.as_count()" + }, + { + "data_source": "metrics", + "name": "query10", + "query": "sum:hugging_face_tgi.batch.decode.duration.count{$host,method:prefill}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 1, + "width": 6, + "height": 3 + } + }, + { + "id": 5373651868721697, + "definition": { + "title": "Average decode time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": ["avg", "min", "max", "value", "sum"], + "time": {}, + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "alias": "Concatenation", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1 / query2" + }, + { + "alias": "Filter", + "number_format": { + "unit": { + "type": 
"canonical_unit", + "unit_name": "second" + } + }, + "formula": "query3 / query4" + }, + { + "alias": "Forward", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query5 / query6" + }, + { + "alias": "Inference - Time Per Token", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query7 / query8" + }, + { + "alias": "Decode", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query9 / query10" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:hugging_face_tgi.batch.concat.duration.sum{$host,method:decode}.as_count()" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:hugging_face_tgi.batch.concat.duration.count{$host,method:decode}.as_count()" + }, + { + "data_source": "metrics", + "name": "query3", + "query": "sum:hugging_face_tgi.batch.filter.duration.sum{$host,method:decode}.as_count()" + }, + { + "data_source": "metrics", + "name": "query4", + "query": "sum:hugging_face_tgi.batch.filter.duration.count{$host,method:decode}.as_count()" + }, + { + "data_source": "metrics", + "name": "query5", + "query": "sum:hugging_face_tgi.batch.forward.duration.sum{$host,method:decode}.as_count()" + }, + { + "data_source": "metrics", + "name": "query6", + "query": "sum:hugging_face_tgi.batch.forward.duration.count{$host,method:decode}.as_count()" + }, + { + "data_source": "metrics", + "name": "query7", + "query": "sum:hugging_face_tgi.batch.inference.duration.sum{$host,method:decode}.as_count()" + }, + { + "data_source": "metrics", + "name": "query8", + "query": "sum:hugging_face_tgi.batch.inference.duration.count{$host,method:decode}.as_count()" + }, + { + "data_source": "metrics", + "name": "query9", + "query": "sum:hugging_face_tgi.batch.decode.duration.sum{$host,method:decode}.as_count()" + }, + { + "data_source": "metrics", + "name": 
"query10", + "query": "sum:hugging_face_tgi.batch.decode.duration.count{$host,method:decode}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 6, + "y": 1, + "width": 6, + "height": 3 + } + }, + { + "id": 21, + "definition": { + "title": "Batch size", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": ["avg", "min", "max", "value", "sum"], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "alias": "Current Batch Size", + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:hugging_face_tgi.batch.current.size{$host}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 4, + "width": 6, + "height": 3 + } + }, + { + "id": 2400395466412779, + "definition": { + "title": "Batch count", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": ["avg", "min", "max", "value", "sum"], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "alias": "Next Batch Count", + "formula": "query2" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query2", + "query": "sum:hugging_face_tgi.batch.next.size.count{$host}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "bars" + } + ] + }, + "layout": { + "x": 6, + "y": 4, + "width": 6, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 0, + "width": 12, + "height": 8, + "is_column_break": true + } + }, + { + "id": 2702502207226428, + "definition": { + "title": "Logs", + 
"background_color": "vivid_yellow", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 4434208871632602, + "definition": { + "type": "note", + "content": "Access comprehensive logging information from your TGI deployment. These logs provide detailed insights into system behavior, error patterns, and operational events that complement the performance metrics for thorough troubleshooting and monitoring.", + "background_color": "yellow", + "font_size": "16", + "text_align": "center", + "vertical_align": "top", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 12, + "height": 1 + } + }, + { + "id": 7649784419120106, + "definition": { + "title": "Logs count by status", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": ["avg", "min", "max", "value", "sum"], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "logs", + "search": { + "query": "service:text-generation-inference OR source:hugging_face_tgi" + }, + "indexes": ["*"], + "group_by": [ + { + "facet": "status", + "limit": 10, + "sort": { + "aggregation": "count", + "order": "desc", + "metric": "count" + }, + "should_exclude_missing": true + } + ], + "compute": { + "aggregation": "count" + }, + "storage": "hot" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "bars" + } + ] + }, + "layout": { + "x": 0, + "y": 1, + "width": 7, + "height": 4 + } + }, + { + "id": 3743214108667418, + "definition": { + "title": "Error logs", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "event_list", + "query": { + "data_source": "logs_stream", + "query_string": 
"(service:text-generation-inference OR source:hugging_face_tgi) AND status:error", + "indexes": [], + "storage": "hot" + }, + "columns": [ + { + "field": "status_line", + "width": "auto" + }, + { + "field": "timestamp", + "width": "auto" + }, + { + "field": "host", + "width": "auto" + }, + { + "field": "service", + "width": "auto" + }, + { + "field": "content", + "width": "compact" + } + ] + } + ], + "type": "list_stream" + }, + "layout": { + "x": 7, + "y": 1, + "width": 5, + "height": 4 + } + } + ] + }, + "layout": { + "x": 0, + "y": 8, + "width": 12, + "height": 6 + } + }, + { + "id": 2258222563095315, + "definition": { + "title": "LLM Observability ", + "background_color": "vivid_yellow", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 467063193958253, + "definition": { + "type": "note", + "content": "[Datadog LLM Observability](https://docs.datadoghq.com/llm_observability/quickstart/?tab=python) enables you to experiment, troubleshoot, monitor, and evaluate LLM agents or applications. Get real-time visibility into inputs and outputs, errors, latency, token usage, and more, along with in-depth quality and security checks at every stage, including data retrieval, tool calls, and agent interactions.\n\nLLM Observability SDK can [automatically trace](https://docs.datadoghq.com/llm_observability/instrumentation/auto_instrumentation?tab=python) your LLM operations. 
Datadog's LLM Observability views integrate with Hugging Face TGI metrics to provide visibility into your workflows.\n\nFor setup instructions and auto-instrumentation, check out the [LLM Observability Quickstart Guide](https://docs.datadoghq.com/llm_observability/quickstart/?tab=python).\n\nTo explore model-level usage, token attribution, cost analysis, and request tracing across teams and providers, visit the [LLM Observability dashboard](https://app.datadoghq.com/dash/integration/llm_operational_insights).", + "background_color": "yellow", + "font_size": "14", + "text_align": "left", + "vertical_align": "center", + "show_tick": true, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 12, + "height": 2 + } + }, + { + "id": 8256861655602834, + "definition": { + "title": "Total LLM Requests", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "llm_observability", + "name": "query1", + "indexes": ["*"], + "compute": { + "aggregation": "count" + }, + "group_by": [], + "search": { + "query": "@event_type:span @meta.model_provider:* @meta.span.kind:llm $host" + } + } + ], + "response_format": "scalar" + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "yaxis": {}, + "type": "bars" + } + }, + "layout": { + "x": 0, + "y": 2, + "width": 4, + "height": 3 + } + }, + { + "id": 7853879441273475, + "definition": { + "title": "LLM Call Response Time (p95)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "queries": [ + { + "aggregator": "percentile", + "data_source": "metrics", + "name": "query1", + "query": "p95:ml_obs.span.duration{span_kind:llm, $host}" + } + ], + 
"response_format": "scalar" + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 4, + "y": 2, + "width": 4, + "height": 3 + } + }, + { + "id": 3452479763987419, + "definition": { + "title": "LLM Call Response Time (p50)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "query1", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + } + } + ], + "queries": [ + { + "aggregator": "percentile", + "data_source": "metrics", + "name": "query1", + "query": "p50:ml_obs.span.duration{span_kind:llm,$host}" + } + ], + "response_format": "scalar" + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 8, + "y": 2, + "width": 4, + "height": 3 + } + }, + { + "id": 8174429076143416, + "definition": { + "title": "Model Usage", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "queries": [ + { + "compute": { + "aggregation": "count" + }, + "data_source": "llm_observability", + "group_by": [ + { + "facet": "@meta.model_provider", + "limit": 10, + "sort": { + "aggregation": "count", + "order": "desc" + } + }, + { + "facet": "@meta.model_name", + "limit": 10, + "sort": { + "aggregation": "count", + "order": "desc" + } + } + ], + "indexes": ["*"], + "name": "query2", + "search": { + "query": "@event_type:span @meta.model_provider:* @meta.span.kind:llm $host" + } + } + ], + "response_format": "scalar", + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query2" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "index": 0, + "order": "desc", + "type": "formula" + } + ] + } + } + ], + "type": "sunburst", + "hide_total": false, + "legend": { + "type": "table" + }, + "custom_links": [ + { + "label": "View related spans in LLM Observability", + "link": 
"/llm/traces?query=@meta.model_name%3A{{@meta.model_name.value}}%20@event_type%3Aspan%20@parent_id%3A*%20@{{$ml_app}}%20{{$version}}&start={{timestamp_widget_start}}&end={{timestamp_widget_end}}&paused=false" + } + ] + }, + "layout": { + "x": 0, + "y": 5, + "width": 12, + "height": 5 + } + } + ] + }, + "layout": { + "x": 0, + "y": 14, + "width": 12, + "height": 11 + } + } + ], + "template_variables": [ + { + "name": "host", + "prefix": "host", + "available_values": [], + "default": "*" + } + ], + "layout_type": "ordered", + "notify_list": [], + "reflow_type": "fixed" +} diff --git a/hugging_face_tgi/images/hugging_face_tgi_overview.png b/hugging_face_tgi/images/hugging_face_tgi_overview.png new file mode 100644 index 0000000000000..a5fba55400125 Binary files /dev/null and b/hugging_face_tgi/images/hugging_face_tgi_overview.png differ diff --git a/hugging_face_tgi/manifest.json b/hugging_face_tgi/manifest.json index c8c055c561b0d..2857af91d1eb0 100644 --- a/hugging_face_tgi/manifest.json +++ b/hugging_face_tgi/manifest.json @@ -2,7 +2,7 @@ "manifest_version": "2.0.0", "app_uuid": "d4448c22-3410-41ae-93c7-ecb662237caf", "app_id": "hugging-face-tgi", - "display_on_public_website": false, + "display_on_public_website": true, "tile": { "overview": "README.md#Overview", "configuration": "README.md#Setup", @@ -10,7 +10,13 @@ "changelog": "CHANGELOG.md", "description": "Monitor the model serving performance and system health of your Hugging Face TGI servers.", "title": "Hugging Face TGI", - "media": [], + "media": [ + { + "caption": "Hugging Face TGI - Overview", + "image_url": "images/hugging_face_tgi_overview.png", + "media_type": "image" + } + ], "classifier_tags": [ "Supported OS::Linux", "Supported OS::Windows", @@ -47,6 +53,9 @@ "High queue size": "assets/monitors/queue_size_high.json", "Slow token generation": "assets/monitors/token_generation_slow.json" }, + "dashboards": { + "Hugging Face TGI Overview": "assets/dashboards/hugging_face_tgi_overview.json" + }, 
"saved_views": { + "Hugging Face TGI Logs Overview": "assets/saved_views/logs_overview.json", + "Hugging Face TGI Error Logs Overview": "assets/saved_views/error_logs_overview.json" diff --git a/kandji/CHANGELOG.md b/kandji/CHANGELOG.md new file mode 100644 index 0000000000000..f695e1be2b73a --- /dev/null +++ b/kandji/CHANGELOG.md @@ -0,0 +1,7 @@ +# CHANGELOG - kandji + +## 1.0.0 / 2025-10-14 + +***Added***: + +* Initial Release diff --git a/kandji/README.md b/kandji/README.md new file mode 100644 index 0000000000000..b9390630f820b --- /dev/null +++ b/kandji/README.md @@ -0,0 +1,39 @@ +# Agent Check: Kandji + +## Overview + +This check monitors [Kandji][1]. + +## Setup + +### Installation + +The Kandji check is included in the [Datadog Agent][2] package. +No additional installation is needed on your server. + +### Configuration + +!!! Add list of steps to set up this integration !!! + +### Validation + +!!! Add steps to validate integration is functioning as expected !!! + +## Data Collected + +### Metrics + +Kandji does not include any metrics. + +### Events + +Kandji does not include any events. + +## Troubleshooting + +Need help? Contact [Datadog support][3]. 
+ +[1]: https://www.kandji.io/ +[2]: https://app.datadoghq.com/account/settings/agent/latest +[3]: https://docs.datadoghq.com/help/ + diff --git a/kandji/manifest.json b/kandji/manifest.json new file mode 100644 index 0000000000000..77cace3e60ae7 --- /dev/null +++ b/kandji/manifest.json @@ -0,0 +1,40 @@ +{ + "manifest_version": "2.0.0", + "app_uuid": "3677fa0b-49e4-49df-b0f2-6a3a983d6637", + "app_id": "kandji", + "display_on_public_website": false, + "tile": { + "overview": "README.md#Overview", + "configuration": "README.md#Setup", + "support": "README.md#Support", + "changelog": "CHANGELOG.md", + "description": "", + "title": "Kandji", + "media": [], + "classifier_tags": [ + "Category::Cloud", + "Category::Log Collection", + "Category::Security", + "Category::OS & System", + "Category::Automation", + "Offering::Integration", + "Submitted Data Type::Logs" + ] + }, + "assets": { + "integration": { + "auto_install": false, + "source_type_id": 58021352, + "source_type_name": "kandji", + "events": { + "creates_events": false + } + } + }, + "author": { + "support_email": "help@datadoghq.com", + "name": "Datadog", + "homepage": "https://www.datadoghq.com/", + "sales_email": "info@datadoghq.com" + } +} \ No newline at end of file