diff --git a/audit_log/jobs_in_regions.sql b/audit_log/jobs_in_regions.sql new file mode 100644 index 0000000..87599a0 --- /dev/null +++ b/audit_log/jobs_in_regions.sql @@ -0,0 +1,30 @@ +/* + * This query will return the project, location, job_type, and a count of the number of jobs run for each combination + * of them, for all projects included in the BQ audit log. + * + * This will assist in tracking down jobs running outside of expected regions or provide an overview of where job types + * are running throughout your organization. + */ + +-- Change this value to change how far in the past the query will search +DECLARE interval_in_days INT64 DEFAULT 7; + +SELECT + protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobName.projectId AS project_id, + protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobName.location AS location, + UPPER(REPLACE(protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.eventName, '_job_completed', '')) AS job_type, + COUNT(3) AS job_type_counter +FROM + `..cloudaudit_googleapis_com_data_access` +WHERE + protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobName.jobId IS NOT NULL + AND protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobName.jobId NOT LIKE 'script_job_%' -- filter BQ script child jobs + AND protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.eventName LIKE '%_job_completed' + AND protopayload_auditlog.authenticationInfo.principalEmail IS NOT NULL + AND protopayload_auditlog.authenticationInfo.principalEmail != "" + AND (timestamp) BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) + AND CURRENT_TIMESTAMP() +GROUP BY + 1, + 2, + 3 \ No newline at end of file diff --git a/audit_log/query_counts.sql b/audit_log/query_counts.sql index 92e0711..b8796a1 100644 --- a/audit_log/query_counts.sql +++ b/audit_log/query_counts.sql @@ -72,7 +72,7 @@ WITH SELECT query, queryCount, - ROUND(SAFE_DIVIDE(totalBytesBilled, POW(1024, 3)00) * 5, 2) AS onDemandCost, + ROUND(SAFE_DIVIDE(totalBytesBilled, POW(1024, 3)) * 5, 2) AS onDemandCost, ROUND(COALESCE(totalBytesBilled, 0), 2) AS totalBytesBilled, ROUND(COALESCE(totalBytesBilled, 0) / POW(1024, 2), 2) AS totalMegabytesBilled, ROUND(COALESCE(totalBytesBilled, 0) / POW(1024, 3), 2) AS totalGigabytesBilled, diff --git a/audit_log/top_billed_queries.sql b/audit_log/top_billed_queries.sql index ffe7215..2014cc4 100644 --- a/audit_log/top_billed_queries.sql +++ b/audit_log/top_billed_queries.sql @@ -12,7 +12,7 @@ SELECT protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.startTime, protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.endTime, ROUND(SAFE_DIVIDE(COALESCE(protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.totalBilledBytes, - 0), POW(1024, 3)00) * 5, 2) AS onDemandCost, + 0), POW(1024, 3)) * 5, 2) AS onDemandCost, ROUND(COALESCE(protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.totalBilledBytes, 0), 2) AS totalBytesBilled, ROUND(COALESCE(protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.totalBilledBytes, diff --git a/audit_log/top_cost_user_by_region_and_project.sql b/audit_log/top_cost_user_by_region_and_project.sql index 58933f2..b398390 100644 --- a/audit_log/top_cost_user_by_region_and_project.sql +++ b/audit_log/top_cost_user_by_region_and_project.sql @@ -11,7 +11,7 @@ WITH src AS ( 
protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.startTime, protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.endTime, protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.totalBilledBytes, - ROUND(SAFE_DIVIDE(protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.totalBilledBytes, POW(1024, 3)00) * 5, 2) AS onDemandCost, + ROUND(SAFE_DIVIDE(protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.totalBilledBytes, POW(1024, 3)) * 5, 2) AS onDemandCost, protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.totalSlotMs, TIMESTAMP_DIFF(protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.endTime, protopayload_auditlog.servicedata_v1_bigquery.jobCompletedEvent.job.jobStatistics.startTime, diff --git a/information_schema/autoscale_usage_and_costs_per_second.sql b/information_schema/autoscale_usage_and_costs_per_second.sql new file mode 100644 index 0000000..bffb582 --- /dev/null +++ b/information_schema/autoscale_usage_and_costs_per_second.sql @@ -0,0 +1,78 @@ +/* + * This query retrieves a detailed, per-second timeline of how slots were utilized and assigned, helping you monitor and understand your BigQuery reservation performance. + * It is useful for observing the autoscaler behavior on a per-reservation basis. + + * Instructions for use: + * 1. Modify the `<project>` and `<region>` placeholders below to match your required values. + * 2. Change the interval_in_days value to look further back in time. By default this is 7 days. + + * Note: If not using the US or EU multi-regions, then the costs may be different. + * Change the standard_edition_cost, enterprise_edition_cost, and enterprise_plus_edition_cost values below to match the actual cost listed here: + * https://cloud.google.com/bigquery/pricing?hl=en#:~:text=size%20calculation.-,Capacity%20compute%20pricing,-BigQuery%20offers%20a + */ + +-- Modify this to go further back in time +DECLARE interval_in_days INT64 DEFAULT 7; + +-- Modify these values if not using the US or EU multi-regions +-- Values can be found here: https://cloud.google.com/bigquery/pricing?hl=en#:~:text=size%20calculation.-,Capacity%20compute%20pricing,-BigQuery%20offers%20a +DECLARE standard_edition_cost NUMERIC DEFAULT 0.04; +DECLARE enterprise_edition_cost NUMERIC DEFAULT 0.06; +DECLARE enterprise_plus_edition_cost NUMERIC DEFAULT 0.10; + +WITH + slot_data AS ( + SELECT + -- Start time (aggregated per-second) + details.start_time, + reservation_name, + edition, + -- Number of slots added to the reservation by autoscaling at this second. + details.autoscale_current_slots, + ((CASE edition + WHEN 'STANDARD' THEN standard_edition_cost + WHEN 'ENTERPRISE' THEN enterprise_edition_cost + WHEN 'ENTERPRISE PLUS' THEN enterprise_plus_edition_cost + END + )/3600) * details.autoscale_current_slots AS autoscaled_cost, + -- Maximum number of slots that could be added to the reservation by autoscaling at this second. + details.autoscale_max_slots, + -- Number of slots assigned to this reservation at this second. It equals the baseline slot capacity of a reservation. 
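+ -- Illustrative example (hypothetical numbers): a baseline of 100 assigned ENTERPRISE slots costs + -- (0.06 / 3600) * 100 ≈ $0.00167 for this one-second row.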
+ details.slots_assigned, + ((CASE edition + WHEN 'STANDARD' THEN standard_edition_cost + WHEN 'ENTERPRISE' THEN enterprise_edition_cost + WHEN 'ENTERPRISE PLUS' THEN enterprise_plus_edition_cost + END + )/3600) * details.slots_assigned AS slots_assigned_cost, + -- Maximum slot capacity for this reservation, including slot sharing at this second. + -- If ignore_idle_slots is true, this field is same as slots_assigned. + -- Otherwise, the slots_max_assigned field is the total number of slots in all capacity commitments in the administration project. + details.slots_max_assigned + FROM + -- Schema reference is here: https://cloud.google.com/bigquery/docs/information-schema-reservation-timeline#schema + -- Change the `<project>` and `<region>` placeholders below to match your required values + ``.``.INFORMATION_SCHEMA.RESERVATIONS_TIMELINE, + -- The per_second_details column is an array of structs, so it needs to be unnested and joined against the rest of the data + -- The full row is per minute, and the details are per second within that minute, hence the array in this column + UNNEST(per_second_details) AS details + WHERE + period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) + AND CURRENT_TIMESTAMP() +) + +SELECT + start_time, + reservation_name, + autoscale_current_slots, + autoscale_max_slots, + SAFE_DIVIDE(autoscale_current_slots, autoscale_max_slots) AS autoscale_usage_ratio, + autoscaled_cost, + slots_assigned, + slots_assigned_cost, + slots_max_assigned, + autoscaled_cost + slots_assigned_cost AS total_slots_cost, + CONCAT('$ ',FORMAT("%'.2f", autoscaled_cost + slots_assigned_cost)) AS total_slots_cost_formatted +FROM + slot_data +ORDER BY + start_time DESC diff --git a/information_schema/billing_model_cost_estimation.sql b/information_schema/billing_model_cost_estimation.sql index dbe87c2..057259d 100644 --- a/information_schema/billing_model_cost_estimation.sql +++ b/information_schema/billing_model_cost_estimation.sql @@ -1,63 +1,5 @@ /* -Query that calculates both logical and physical storage billing costs -along with compression ratio (>2.0 means cost savings in physical -billing model). Does not take into account 10gb free storage per month -given that is weighted equally between logical/physical so it cancels -out when it comes to considering which provides the most savings. - -All numbers are grouped by dataset and considered estimates only. 
-*/ -DECLARE active_logical_gb_price FLOAT64 DEFAULT 0.02; -DECLARE long_term_logical_gb_price FLOAT64 DEFAULT 0.01; -DECLARE active_physical_gb_price FLOAT64 DEFAULT 0.04; -DECLARE long_term_physical_gb_price FLOAT64 DEFAULT 0.02; - -with storage_sizes as -( - select - a.project_id - ,a.table_schema - ,SUM(a.active_logical_bytes) / power(1000, 3) AS active_logical_gb - ,SUM(a.long_term_logical_bytes) / power(1000, 3) AS long_term_logical_gb - ,SUM(a.active_physical_bytes) / power(1000, 3) AS active_physical_gb - ,SUM(a.long_term_physical_bytes) / power(1000, 3) AS long_term_physical_gb - ,SUM(a.total_physical_bytes) / power(1000, 3) AS total_physical_gb - ,SUM(a.total_logical_bytes) / power(1000, 3) AS total_logical_gb - from - ``.``.INFORMATION_SCHEMA.TABLE_STORAGE_BY_PROJECT a - join ``.``.INFORMATION_SCHEMA.TABLES b - on a.project_id = b.table_catalog - and a.table_schema = b.table_schema - and a.table_name = b.table_name - WHERE a.total_logical_bytes > 0 - AND a.total_physical_bytes > 0 - GROUP BY a.project_id, a.table_schema -) -,cost_calcs as -( - select - project_id - ,table_schema - ,active_logical_gb - ,active_logical_gb*active_logical_gb_price as active_logical_cost - ,long_term_logical_gb - ,long_term_logical_gb*long_term_logical_gb_price as long_term_logical_cost - ,active_physical_gb - ,active_physical_gb*active_physical_gb_price as active_physical_cost - ,long_term_physical_gb - ,long_term_physical_gb*long_term_physical_gb_price as long_term_physical_cost - ,total_logical_gb / total_physical_gb AS compression_ratio - from storage_sizes -) -select - project_id - ,table_schema - ,active_logical_gb - ,long_term_logical_gb - ,(active_logical_cost+long_term_logical_cost) as total_logical_cost - ,active_physical_gb - ,long_term_physical_gb - ,(active_physical_cost+long_term_physical_cost) as total_physical_cost - ,compression_ratio -from -cost_calcs; + * This file has been deprecated, but it is linked to from all over the internet. + * Please use this SQL file instead: + * https://github.com/doitintl/bigquery-optimization-queries/blob/main/information_schema/project_storage_model_recommendation.sql + */ diff --git a/information_schema/billing_recommendation_per_query.sql b/information_schema/billing_recommendation_per_query.sql index 5d9d8f8..abc2938 100644 --- a/information_schema/billing_recommendation_per_query.sql +++ b/information_schema/billing_recommendation_per_query.sql @@ -1,6 +1,13 @@ -- This query will look at every query run over the specified timeframe and determine if it is better to be run -- under an on-demand or an Editions pricing billing model. In the final resultset, it will recommend on-demand --- or an Edition (or an Edition with a commit period) +-- or an Edition (or an Edition with a commit period) for each Edition type and commit period + +/* + * This query makes real-world estimates of the actual costs of Editions when using the autoscaler. + * It does this by modeling a few nuances of the autoscaler and its behavior, + * namely that each query is billed for a minimum of 1 minute and + * slots are billed in increments of 50 (rounded up to the nearest 50). 
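+ * For example (hypothetical numbers): a query that averaged ~120 slots for 10 seconds is billed + * as 150 slots (rounded up to the nearest 50) for a full minute, i.e. 150 * (60/3600) = 2.5 slot-hours.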
+*/ -- Change this value to change how far in the past the query will search DECLARE interval_in_days INT64 DEFAULT 14; @@ -26,7 +33,7 @@ WITH 0) / POW(1024, 4), 2) AS totalTerabytesBilled, TIMESTAMP_DIFF(end_time, start_time, MILLISECOND) AS executionTimeMs FROM - ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT + ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT WHERE creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) AND CURRENT_TIMESTAMP() @@ -34,22 +41,36 @@ WITH AND state = "DONE" ORDER BY approximateSlotCount DESC ), + rounded AS ( + SELECT + *, + -- Rounds up to the nearest 50 slots for the autoscaler + CEIL(approximateSlotCount / 50) * 50 AS roundedUpSlots, + + -- If query ran in under 1 minute (60 seconds * 1000 ms) then round up to 1 minute + IF(executionTimeMs < 1000*60, 1000*60, executionTimeMs) AS billedDurationMs, + -- Calculates the duration in hours for calculating slot/hours used + -- Formula: (Execution Time in ms)/(1000 ms * 60 seconds * 60 minutes) + IF(executionTimeMs < 1000*60, 1000*60, executionTimeMs)/(1000*60*60) AS billedDurationHour + FROM src + ), costs AS ( SELECT *, - ROUND(SAFE_DIVIDE(totalBytesBilled, - POW(1024, 4)) * 5, 2) AS legacyOnDemandCost, - ROUND(SAFE_DIVIDE(totalBytesBilled, - POW(1024, 4)) * 6.25, 2) AS onDemandCost, - (approximateSlotCount/(60*60)) * 0.04 AS standardEditionCost, - (approximateSlotCount/(60*60)) * 0.06 AS enterpriseEditionCost, - (approximateSlotCount/(60*60)) * 0.048 AS enterpriseEdition1YearCost, - (approximateSlotCount/(60*60)) * 0.036 AS enterpriseEdition3YearCost, - (approximateSlotCount/(60*60)) * 0.1 AS enterprisePlusEditionCost, - (approximateSlotCount/(60*60)) * 0.08 AS enterprisePlusEdition1YearCost, - (approximateSlotCount/(60*60)) * 0.06 AS enterprisePlusEdition3YearCost, + SAFE_DIVIDE(totalBytesBilled, + POW(1024, 4)) * 5 AS legacyOnDemandCost, + SAFE_DIVIDE(totalBytesBilled, + POW(1024, 4)) * 6.25 AS onDemandCost, + + roundedUpSlots * billedDurationHour * 0.04 AS standardEditionCost, + roundedUpSlots * billedDurationHour * 0.06 AS enterpriseEditionCost, + roundedUpSlots * billedDurationHour * 0.048 AS enterpriseEdition1YearCost, + roundedUpSlots * billedDurationHour * 0.036 AS enterpriseEdition3YearCost, + roundedUpSlots * billedDurationHour * 0.1 AS enterprisePlusEditionCost, + roundedUpSlots * billedDurationHour * 0.08 AS enterprisePlusEdition1YearCost, + roundedUpSlots * billedDurationHour * 0.06 AS enterprisePlusEdition3YearCost FROM - src + rounded ), queryCounts AS ( SELECT @@ -65,14 +86,18 @@ WITH projectId, startTime, endTime, + billedDurationHour AS editionsBilledDurationInHours, + totalBytesBilled, totalMegabytesBilled, totalGigabytesBilled, totalTerabytesBilled, approximateSlotCount, + roundedUpSlots AS billedSlotCount, legacyOnDemandCost, onDemandCost, + standardEditionCost, enterpriseEditionCost, enterpriseEdition1YearCost, @@ -96,21 +121,24 @@ WITH costs.query = queryCounts.query ) SELECT + projectId, query, - IF(standardEditionComparison > 0, 'On-demand', 'Standard Edition') AS standardEditionRecommendation, - IF(enterpriseEditionComparison > 0, 'On-demand', 'Enterprise Edition') AS enterpriseEditionRecommendation, - IF(enterpriseEdition1YearComparison > 0, 'On-demand', 'Enterprise Edition 1 Year Commit') AS + IF(standardEditionComparison < 0, 'On-demand', 'Standard Edition') AS standardEditionRecommendation, + IF(enterpriseEditionComparison < 0, 'On-demand', 'Enterprise Edition') AS enterpriseEditionRecommendation, + IF(enterpriseEdition1YearComparison < 0, 'On-demand', 'Enterprise Edition 1 Year Commit') AS enterpriseEdition1YearRecommendation, - IF(enterpriseEdition3YearComparison > 0, 
'On-demand', 'Enterprise Edition 3 Year Commit') AS enterpriseEdition3YearRecommendation, + IF(enterpriseEdition3YearComparison < 0, 'On-demand', 'Enterprise Edition 3 Year Commit') AS enterpriseEdition3YearRecommendation, - IF(enterpriseEditionComparison > 0, 'On-demand', 'Enterprise Plus Edition') AS enterprisePlusEditionRecommendation, - IF(enterpriseEdition1YearComparison > 0, 'On-demand', 'Enterprise Edition 1 Year Commit') AS + IF(enterprisePlusEditionComparison < 0, 'On-demand', 'Enterprise Plus Edition') AS enterprisePlusEditionRecommendation, + IF(enterprisePlusEdition1YearComparison < 0, 'On-demand', 'Enterprise Plus Edition 1 Year Commit') AS + enterprisePlusEdition1YearRecommendation, - IF(enterprisePlusEdition3YearComparison > 0, 'On-demand', 'Enterprise Plus Edition 3 Year Commit') AS enterprisePlusEdition3YearRecommendation, + IF(enterprisePlusEdition3YearComparison < 0, 'On-demand', 'Enterprise Plus Edition 3 Year Commit') AS enterprisePlusEdition3YearRecommendation, startTime, endTime, + editionsBilledDurationInHours, approximateSlotCount, + billedSlotCount, legacyOnDemandCost, onDemandCost, @@ -131,4 +159,4 @@ FROM ORDER BY onDemandCost DESC, approximateSlotCount DESC; -END \ No newline at end of file +END diff --git a/information_schema/bqe_slot_utilization_by_minute.sql b/information_schema/bqe_slot_utilization_by_minute.sql new file mode 100644 index 0000000..396675c --- /dev/null +++ b/information_schema/bqe_slot_utilization_by_minute.sql @@ -0,0 +1,105 @@ +/* +This query pulls job data for a project and calculates job metrics +to the minute grain (i.e. for each minute X number of jobs running +and Y total approximate slots utilized). This rowset is then joined +to reservations timeline data to calculate estimated utilization +of available slots per minute on a designated BigQuery Editions +reservation (not on demand) by a project. + +Result set schema: + - period_start: Start time of one minute period for the reservation + - reservation_name: Name of the reservation + - slots_assigned: The number of slots assigned to this reservation (baseline slots) + - current_slots: The number of slots added to the reservation by autoscaling + - max_slots: The maximum number of slots that can be added to the reservation by autoscaling + - reservation_id: ID of the reservation in the format of "project_id:location.reservation_name" + - active_jobs_per_period: Number of jobs that were running or completed within a period + - slots_per_period: Approximate number of slots utilized within a period + - utilization_pct: Approximate utilization percentage of current slots within a period +*/ + +declare interval_in_days int64; declare res_name string; set interval_in_days = 7; set res_name = ''; +with reservation_data as +( + select + period_start, + reservation_name, + slots_assigned, + autoscale.current_slots, + autoscale.max_slots, + reservation_id + from + --Fill in project_id and region where reservation is set up. 
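+ --e.g. (hypothetical values): `my-admin-project`.`region-us`.INFORMATION_SCHEMA.RESERVATIONS_TIMELINE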
``.``.INFORMATION_SCHEMA.RESERVATIONS_TIMELINE + where + period_start > timestamp_sub(current_timestamp(), INTERVAL interval_in_days DAY) + and reservation_name = res_name +), +base_jobs_data as +( + --Pull all jobs data in the org sliced by the second + SELECT + --Truncate period to the minute + timestamp_trunc(a.period_start,minute) as period_start, + a.period_slot_ms, + a.job_id, + a.job_start_time, + a.job_end_time, + a.reservation_id + FROM + --Fill in project_id and region where jobs are being submitted to + ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE_BY_ORGANIZATION a + join + --Limit jobs data only to ones that ran in the designated reservation + (select distinct reservation_id from reservation_data) b + on a.reservation_id = b.reservation_id + WHERE + period_slot_ms IS NOT NULL + and job_creation_time > timestamp_sub(current_timestamp(), INTERVAL interval_in_days DAY) + --Avoid duplication caused by script jobs due to parent/child thread scenarios + and (statement_type != "SCRIPT" OR statement_type IS NULL) +), +jobs_transform1 as +( + --Aggregate each job's periods to the minute + select + period_start, + job_id, + reservation_id, + sum(period_slot_ms) as period_slot_ms_per_minute + from base_jobs_data + group by 1,2,3 +), +jobs_transform2 as +( + --Convert slot_ms to slots in each job's period + select + period_start, + reservation_id, + count(distinct job_id) as active_jobs_per_period, + round(safe_divide(cast(sum(period_slot_ms_per_minute) as float64),60000),2) as slots_per_period + from + jobs_transform1 + group by 1,2 +) +--Join reservation periods with job periods for final recordset. +select + r.*, + coalesce(j.active_jobs_per_period,0) as active_jobs_per_period, + coalesce(j.slots_per_period,0) as slots_per_period, + case + when r.current_slots = 0 then 0 + else (coalesce(j.slots_per_period,0)/r.current_slots)*100 + end as utilization_pct +from +reservation_data r +left join +jobs_transform2 j +on r.period_start = j.period_start +and r.reservation_id = j.reservation_id +order by r.period_start desc +; diff --git a/information_schema/costs_by_query.sql b/information_schema/costs_by_query.sql new file mode 100644 index 0000000..c640175 --- /dev/null +++ b/information_schema/costs_by_query.sql @@ -0,0 +1,112 @@ +-- This query analyzes BigQuery usage over a specified timeframe +-- focusing on slot usage and data processed without edition recommendations + +-- Change this value to change how far in the past the query will search +DECLARE interval_in_days INT64 DEFAULT 7; + +BEGIN +WITH + src AS ( + SELECT + ROUND(SAFE_DIVIDE(total_slot_ms, + TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)), 2) AS approximateSlotCount, + job_type, + query, + project_id AS projectId, + start_time AS startTime, + end_time AS endTime, + ROUND(COALESCE(total_bytes_billed, + 0), 2) AS totalBytesBilled, + ROUND(COALESCE(total_bytes_billed, + 0) / POW(1024, 2), 2) AS totalMegabytesBilled, + ROUND(COALESCE(total_bytes_billed, + 0) / POW(1024, 3), 2) AS totalGigabytesBilled, + ROUND(COALESCE(total_bytes_billed, + 0) / POW(1024, 4), 2) AS totalTerabytesBilled, + TIMESTAMP_DIFF(end_time, start_time, MILLISECOND) AS executionTimeMs + FROM + ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT + WHERE + creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) + AND CURRENT_TIMESTAMP() + AND total_slot_ms IS NOT NULL + AND state = "DONE" + ORDER BY + approximateSlotCount DESC + ), + rounded AS ( + SELECT + *, + -- Rounds up to the nearest 50 slots (autoscaler increments) + floor((CEIL(approximateSlotCount) 
+ 49) / 50) * 50 AS roundedUpSlots, + + -- If query ran in under 1 minute (60 seconds * 1000 ms) then round up to 1 minute + IF(executionTimeMs < 1000*60, 1000*60, executionTimeMs) AS billedDurationMs, + -- Calculates the duration in hours for calculating slot/hours used + -- Formula: (Execution Time in ms)/(1000 ms * 60 seconds * 60 minutes) + IF(executionTimeMs < 1000*60, 1000*60, executionTimeMs)/(1000*60*60) AS billedDurationHour, + + -- Apply minimum 10 MiB billing per query + GREATEST(totalBytesBilled, 10 * POW(1024, 2)) AS billedBytes, + GREATEST(totalMegabytesBilled, 10) AS billedMegabytes, + GREATEST(totalGigabytesBilled, 10/1024) AS billedGigabytes, + GREATEST(totalTerabytesBilled, 10/POW(1024, 2)) AS billedTerabytes + FROM src + ), + queryCounts AS ( + SELECT + query, + COUNT(query) AS queryCount + FROM + src + GROUP BY + query + ), + queryMetrics AS ( + SELECT + rounded.query, + projectId, + startTime, + endTime, + billedDurationHour, + billedBytes, + billedMegabytes, + billedGigabytes, + billedTerabytes, + approximateSlotCount, + roundedUpSlots, + -- Calculate slot-hours for this query + roundedUpSlots * billedDurationHour AS slotHours, + -- Calculate on-demand cost + SAFE_DIVIDE(billedBytes, POW(1024, 4)) * 6.25 AS onDemandCost, + queryCount + FROM + rounded + JOIN + queryCounts + ON + rounded.query = queryCounts.query + ) + +-- Final output with query metrics +SELECT + projectId, + query, + startTime, + endTime, + billedDurationHour AS queryDurationHours, + approximateSlotCount, + roundedUpSlots AS billedSlotCount, + slotHours, + onDemandCost, + queryCount AS executionCount, + billedBytes, + billedMegabytes, + billedGigabytes, + billedTerabytes +FROM + queryMetrics +ORDER BY + onDemandCost DESC, + slotHours DESC; +END diff --git a/information_schema/general_job_information.sql b/information_schema/general_job_information.sql index 0582d57..4453464 100644 --- a/information_schema/general_job_information.sql +++ b/information_schema/general_job_information.sql @@ -8,6 +8,8 @@ BEGIN SELECT user_email AS user, job_type AS jobType, + reservation_id AS reservationId, + IF(reservation_id IS NULL, 'On-Demand', 'Editions') AS queryBillingType, query, job_id AS jobId, project_id AS projectId, @@ -19,7 +21,7 @@ BEGIN ROUND(COALESCE(total_bytes_billed, 0) / POW(1024, 4), 2) AS totalTerabytesBilled, total_slot_ms AS totalSlotMs, ROUND(SAFE_DIVIDE(total_bytes_billed, - POW(1024, 4)) * 5, 2) AS onDemandCost, + POW(1024, 4)) * 6.25, 2) AS onDemandCost, ROUND(SAFE_DIVIDE(total_slot_ms, TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)), 2) AS approximateSlotCount, ROW_NUMBER() OVER (PARTITION BY job_id ORDER BY end_time DESC) AS _rnk @@ -44,4 +46,4 @@ SELECT * FROM jobsDeduplicated; -END \ No newline at end of file +END diff --git a/information_schema/load_slots_per_minute.sql b/information_schema/load_slots_per_minute.sql index 87de6a1..74636af 100644 --- a/information_schema/load_slots_per_minute.sql +++ b/information_schema/load_slots_per_minute.sql @@ -1,59 +1,46 @@ --- This query will show the slot usage of all load jobs for each second in the specified timeframe. +-- This query will show the slot usage of all load jobs for each minute in the specified timeframe. 
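+-- For example, a minute in which load jobs consumed a combined 1,800,000 slot-ms averages +-- 1,800,000 / 60,000 = 30 slots for that minute.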
-- Change this value to change how far in the past the query will search DECLARE interval_in_days INT64 DEFAULT 7; +DECLARE time_period INT64 DEFAULT (1000 * 60); -- Number of milliseconds in a minute + BEGIN WITH src AS ( SELECT - user_email AS user, - job_id AS jobId, - query, - DATETIME_TRUNC(start_time, - MINUTE) AS startTime, - DATETIME_TRUNC(end_time, - MINUTE) AS endTime, - SAFE_DIVIDE(total_slot_ms, - TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS slotCount, - TIMESTAMP_DIFF(end_time, start_time, MINUTE) AS diff, - ROW_NUMBER() OVER (PARTITION BY job_id ORDER BY end_time DESC) AS _rnk - FROM - ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT - WHERE - creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) - AND CURRENT_TIMESTAMP() - AND total_slot_ms IS NOT NULL - AND job_type = 'LOAD'), - jobsDeduplicated AS ( - SELECT - * EXCEPT(_rnk) + SAFE_DIVIDE(SUM(period_slot_ms), time_period) AS slot_usage, -- Divide by 1 minute (1000 ms * 60 seconds) to convert to slots/minute + TIMESTAMP_TRUNC(period_start, MINUTE) as period_start FROM - src + ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE WHERE - _rnk = 1 ), - differences AS ( - SELECT - *, - generate_timestamp_array(startTime, - endTime, - INTERVAL 1 MINUTE) AS int - FROM - jobsDeduplicated ), - byMinutes AS ( + period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) AND CURRENT_TIMESTAMP() + AND job_type = 'LOAD' + GROUP BY + period_start + ORDER BY + period_start DESC + ), + time_series AS( + SELECT + * + FROM + UNNEST(generate_timestamp_array(TIMESTAMP_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY), MINUTE), + TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), MINUTE), + INTERVAL 1 MINUTE)) AS timeInterval + ), + joined AS ( SELECT - * EXCEPT(int) + COALESCE(src.slot_usage, 0) as slot_usage, + timeInterval FROM - differences, - UNNEST(int) AS minute ) + src RIGHT OUTER JOIN time_series + ON period_start = timeInterval + ) SELECT - slotCount, - user, - jobId, - query, - minute + * FROM - byMinutes + joined ORDER BY - minute ASC; -END \ No newline at end of file + timeInterval ASC; +END diff --git a/information_schema/load_slots_per_second.sql b/information_schema/load_slots_per_second.sql index 1b3b845..b5fec56 100644 --- a/information_schema/load_slots_per_second.sql +++ b/information_schema/load_slots_per_second.sql @@ -3,58 +3,44 @@ -- Change this value to change how far in the past the query will search DECLARE interval_in_days INT64 DEFAULT 7; +DECLARE time_period INT64 DEFAULT (1000); -- Number of milliseconds in a second + BEGIN -WITH - src AS ( - SELECT - user_email AS user, - job_id AS jobId, - query, - DATETIME_TRUNC(start_time, - SECOND) AS startTime, - DATETIME_TRUNC(end_time, - SECOND) AS endTime, - SAFE_DIVIDE(total_slot_ms, - TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS slotCount, - TIMESTAMP_DIFF(end_time, start_time, MINUTE) AS diff, - ROW_NUMBER() OVER (PARTITION BY job_id ORDER BY end_time DESC) AS _rnk - FROM - ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT - WHERE - creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) - AND CURRENT_TIMESTAMP() - AND total_slot_ms IS NOT NULL - AND job_type = 'QUERY'), - jobsDeduplicated AS ( SELECT - * EXCEPT(_rnk) + SAFE_DIVIDE(SUM(period_slot_ms), time_period) AS slotUsage, -- Divide by 1 second (1000 ms) to convert to slots/second + period_start FROM - src + ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE WHERE - _rnk = 1 ), - differences AS ( - SELECT - *, - 
generate_timestamp_array(startTime, - endTime, - INTERVAL 1 SECOND) AS int - FROM - jobsDeduplicated ), - bySeconds AS ( + period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) AND CURRENT_TIMESTAMP() + AND job_type = 'LOAD' + GROUP BY + period_start + ORDER BY + period_start DESC + ), + timeSeries AS( + SELECT + * + FROM + UNNEST(generate_timestamp_array(TIMESTAMP_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY), SECOND), + TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), SECOND), + INTERVAL 1 SECOND)) AS timeInterval + ), + joined AS ( SELECT - * EXCEPT(int) + COALESCE(src.slotUsage, 0) as slotUsage, + timeInterval FROM - differences, - UNNEST(int) AS second ) + src RIGHT OUTER JOIN timeSeries + ON period_start = timeInterval + ) SELECT - slotCount, - user, - jobId, - query, - second + * FROM - bySeconds + joined ORDER BY - second ASC; + timeInterval ASC; END \ No newline at end of file diff --git a/information_schema/longest_running_queries.sql b/information_schema/longest_running_queries.sql index 0932c47..d6a1813 100644 --- a/information_schema/longest_running_queries.sql +++ b/information_schema/longest_running_queries.sql @@ -58,4 +58,4 @@ FROM queriesDeduplicated ORDER BY runningTimeInSeconds DESC; -END \ No newline at end of file +END diff --git a/information_schema/project_per_table_storage_model_recommendations.sql b/information_schema/project_per_table_storage_model_recommendations.sql new file mode 100644 index 0000000..7a9265e --- /dev/null +++ b/information_schema/project_per_table_storage_model_recommendations.sql @@ -0,0 +1,190 @@ +/* + * This query will look at a single project (by default in the US multi-region) and + * calculate the logical and physical billing prices for each table in all datasets + * contained inside of it, then provide a recommendation on whether it is better to put it + * into a dataset that uses the logical (uncompressed) or physical (compressed) + * storage billing model. + * + * Physical (also called compressed) Storage went GA on July 5, 2023. It is set at + * the dataset level, so if two or more tables are better suited for different + * billing models in the same dataset it may be best to split them into separate + * datasets to get the best cost benefit. + * + * Note that there are not any performance penalties for using physical storage, as + * the underlying data is always compressed and operations already work on that compressed + * data. This is strictly a billing change. + * + * Note that there isn't a 100% reliable way to determine whether an + * existing table is billed as logical or physical storage (detection only works on newly created + * tables; this is a known issue the BQ team is working on). Once this has + * been fixed this query will be updated to show the current model. + * + * The storage CTE also includes many extra values that can be used + * for other calculations; these are left in to help you + * make the best decision or to see additional information about your tables/datasets. + * + * Note it targets the US multi-region by default. If you need to change the region, + * change `region-us` below to whichever region the data exists in. Also uncomment + * the DECLARE values below for the EU multi-region; if you are using a non-multi-region, + * refer here: https://cloud.google.com/bigquery/pricing#storage + * for the correct pricing and update accordingly. 
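+ * For example (hypothetical numbers): a table with 100 GiB logical and 40 GiB physical storage has a + * compression ratio of 2.5; at US multi-region active prices that is 100 * $0.02 = $2.00/month logical vs + * 40 * $0.04 = $1.60/month physical, so physical wins before time-travel and fail-safe costs are added.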
+ */ + +-- These values are for the US multi-region +-- Comment these out and uncomment below if using the EU multi-region +DECLARE active_logical_price_per_gb NUMERIC DEFAULT 0.02; +DECLARE long_term_logical_price_per_gb NUMERIC DEFAULT 0.01; +DECLARE active_physical_price_per_gb NUMERIC DEFAULT 0.04; +DECLARE long_term_physical_price_per_gb NUMERIC DEFAULT 0.02; + +-- These values are for the EU multi-region +-- Uncomment these and comment out the above if using the EU multi-region +/* +DECLARE active_logical_price_per_gb NUMERIC DEFAULT 0.02; +DECLARE long_term_logical_price_per_gb NUMERIC DEFAULT 0.01; +DECLARE active_physical_price_per_gb NUMERIC DEFAULT 0.044; +DECLARE long_term_physical_price_per_gb NUMERIC DEFAULT 0.022; +*/ + +WITH storage AS +( + SELECT DISTINCT + tb.table_name, + tb.table_schema AS dataset, + total_rows, + total_partitions, + + -- Uncompressed bytes + total_logical_bytes AS total_uncompressed_bytes, + total_logical_bytes/POW(1024, 3) AS total_logical_gibytes, + total_logical_bytes/POW(1024, 4) AS total_logical_tibytes, + active_logical_bytes AS active_uncompressed_bytes, + active_logical_bytes/POW(1024, 3) AS active_uncompressed_gibytes, + active_logical_bytes/POW(1024, 4) AS active_uncompressed_tibytes, + long_term_logical_bytes AS long_term_uncompressed_bytes, + long_term_logical_bytes/POW(1024, 3) AS long_term_uncompressed_gibytes, + long_term_logical_bytes/POW(1024, 4) AS long_term_uncompressed_tibytes, + + -- Compressed bytes + total_physical_bytes AS total_compressed_bytes, + total_physical_bytes/POW(1024, 3) AS total_compressed_gibytes, + total_physical_bytes/POW(1024, 4) AS total_compressed_tibytes, + -- Note that active physical bytes includes time travel so need to remove that + active_physical_bytes-time_travel_physical_bytes AS active_compressed_bytes, + (active_physical_bytes-time_travel_physical_bytes)/POW(1024, 3) AS active_compressed_gibytes, + (active_physical_bytes-time_travel_physical_bytes)/POW(1024, 4) AS active_compressed_tibytes, + long_term_physical_bytes AS long_term_compressed_bytes, + long_term_physical_bytes/POW(1024, 3) AS long_term_compressed_gibytes, + long_term_physical_bytes/POW(1024, 4) AS long_term_compressed_tibytes, + time_travel_physical_bytes AS time_travel_compressed_bytes, + time_travel_physical_bytes/POW(1024, 3) AS time_travel_compressed_gibytes, + time_travel_physical_bytes/POW(1024, 4) AS time_travel_compressed_tibytes, + fail_safe_physical_bytes AS fail_safe_physical_bytes, + fail_safe_physical_bytes/POW(1024, 3) AS fail_safe_compressed_gibytes, + fail_safe_physical_bytes/POW(1024, 4) AS fail_safe_compressed_tibytes, + + -- Compression ratios + SAFE_DIVIDE(total_logical_bytes, total_physical_bytes) AS total_compression_ratio, -- Defined as uncompressed size/compressed size + SAFE_DIVIDE(long_term_logical_bytes, long_term_physical_bytes) AS long_term_compression_ratio, + SAFE_DIVIDE(active_logical_bytes, active_physical_bytes) AS active_compression_ratio, + + -- Pricing + ((active_logical_bytes/POW(1024, 3))*active_logical_price_per_gb) + + ((long_term_logical_bytes/POW(1024, 3))*long_term_logical_price_per_gb) AS total_uncompressed_price, + ((active_logical_bytes/POW(1024, 3))*active_logical_price_per_gb) AS active_uncompressed_price, + ((long_term_logical_bytes/POW(1024, 3))*long_term_logical_price_per_gb) AS long_term_uncompressed_price, + (((active_physical_bytes-time_travel_physical_bytes)/POW(1024, 3))*active_physical_price_per_gb) + + ((long_term_physical_bytes/POW(1024, 3))*long_term_physical_price_per_gb) AS 
total_compressed_price, + (((active_physical_bytes-time_travel_physical_bytes)/POW(1024, 3))*active_physical_price_per_gb) AS active_compressed_price, + (long_term_physical_bytes/POW(1024, 3))*long_term_physical_price_per_gb AS long_term_compressed_price, + (time_travel_physical_bytes/POW(1024, 3))*active_physical_price_per_gb AS time_travel_compressed_price, + (fail_safe_physical_bytes/POW(1024, 3))*active_physical_price_per_gb AS fail_safe_compressed_price + FROM + ``.``.INFORMATION_SCHEMA.TABLE_STORAGE AS tb + + -- Need to join on TABLES for existing tables to remove any temporary or job result tables + -- Note due to this information being in the TABLE_STORAGE view this means it cannot be + -- performed across an entire organization without checking the TABLES view in each project. + JOIN ``.``.INFORMATION_SCHEMA.TABLES AS t + ON t.table_catalog = tb.project_id + AND t.table_schema = tb.table_schema + AND t.table_name = tb.table_name + WHERE + tb.deleted = false + AND t.table_type = 'BASE TABLE' +), +calculations AS +( + SELECT + table_name, + dataset, + SUM(active_uncompressed_price) AS active_uncompressed_price, + SUM(active_compressed_price) AS active_compressed_price, + SUM(long_term_uncompressed_price) AS long_term_uncompressed_price, + SUM(long_term_compressed_price) AS long_term_compressed_price, + SUM(time_travel_compressed_price) AS time_travel_compressed_price, + SUM(fail_safe_compressed_price) AS fail_safe_compressed_price + FROM + storage + GROUP BY + table_name, dataset +), +final_data AS +( + SELECT + dataset, + table_name, + + -- Price differences, note that >0 means physical storage is cheaper before adding in time travel and fail-safe + active_uncompressed_price-active_compressed_price AS active_price_difference, + long_term_uncompressed_price-long_term_compressed_price AS long_term_price_difference, + + -- Time travel and fail safe storage reductions + (time_travel_compressed_price+fail_safe_compressed_price) AS additional_costs_for_physical_storage, + + -- Totals for each model + active_uncompressed_price+long_term_uncompressed_price AS logical_storage_price, + (active_compressed_price+long_term_compressed_price)+ + (time_travel_compressed_price+fail_safe_compressed_price) AS physical_storage_price, + + -- Difference in values (logical - physical) + (active_uncompressed_price+long_term_uncompressed_price) + - + ( + (active_compressed_price+long_term_compressed_price)+ + (time_travel_compressed_price+fail_safe_compressed_price) + ) AS difference + FROM + calculations +) + +SELECT + dataset, + table_name, + + -- Logical prices and base (before adding in time travel and fail-safe reductions) physical price + CONCAT('$ ',FORMAT("%'.2f", active_uncompressed_price)) AS logical_active_price, + CONCAT('$ ',FORMAT("%'.2f", active_compressed_price)) AS base_physical_price, + CONCAT('$ ',FORMAT("%'.2f", long_term_uncompressed_price)) AS logical_long_term_price, + CONCAT('$ ',FORMAT("%'.2f", long_term_compressed_price)) AS base_long_term_price, + + -- Time travel and fail safe storage reductions + CONCAT('$ ',FORMAT("%'.2f", additional_costs_for_physical_storage)) AS additional_costs_for_physical_storage, + + -- Totals for each model + -- Note physical_storage_price is the total price with the time-travel and fail-safe reductions factored in + CONCAT('$ ',FORMAT("%'.2f", logical_storage_price)) AS logical_storage_price, + CONCAT('$ ',FORMAT("%'.2f", physical_storage_price)) AS physical_storage_price, + + -- Difference between logical storage and physical storage (logical - physical) + -- Note that a negative value means 
logical/uncompressed is cheaper + CONCAT('$ ',FORMAT("%'.2f", difference)) AS difference_in_price_if_physical_is_chosen, + + -- Recommendation + IF(logical_storage_price < physical_storage_price, + 'Logical storage', 'Physical storage') AS recommendation, + + -- If you wish to get the raw values that are not formatted, uncomment the line below + --final_data.* EXCEPT(dataset) + FROM + final_data +; diff --git a/information_schema/project_storage_model_recommendation.sql b/information_schema/project_storage_model_recommendation.sql new file mode 100644 index 0000000..ee4ce8e --- /dev/null +++ b/information_schema/project_storage_model_recommendation.sql @@ -0,0 +1,262 @@ +-- These values are for the US multi-region +-- Comment these out and uncomment below if using the EU multi-region +DECLARE active_logical_price_per_gb NUMERIC DEFAULT 0.02; +DECLARE long_term_logical_price_per_gb NUMERIC DEFAULT 0.01; +DECLARE active_physical_price_per_gb NUMERIC DEFAULT 0.04; +DECLARE long_term_physical_price_per_gb NUMERIC DEFAULT 0.02; + +-- These values are for the EU multi-region +-- Comment these out and uncomment above if using the US multi-region +/* +DECLARE active_logical_price_per_gb NUMERIC DEFAULT 0.02; +DECLARE long_term_logical_price_per_gb NUMERIC DEFAULT 0.01; +DECLARE active_physical_price_per_gb NUMERIC DEFAULT 0.044; +DECLARE long_term_physical_price_per_gb NUMERIC DEFAULT 0.022; +*/ + +-- Do not modify these three lines +DECLARE location_query_template STRING DEFAULT "SET @@location = '';\n"; +DECLARE physical_storage_query_template STRING DEFAULT "ALTER SCHEMA `<dataset>` SET OPTIONS(storage_billing_model = 'PHYSICAL')"; +DECLARE logical_storage_query_template STRING DEFAULT "ALTER SCHEMA `<dataset>` SET OPTIONS(storage_billing_model = 'LOGICAL')"; + +WITH storage AS +( + SELECT DISTINCT + tb.table_schema, + tb.table_name, + CONCAT(tb.PROJECT_ID, '.', tb.table_schema) AS dataset, + total_rows, + total_partitions, + + -- Uncompressed bytes + total_logical_bytes AS total_uncompressed_bytes, + total_logical_bytes/POW(1024, 3) AS total_logical_gibytes, + total_logical_bytes/POW(1024, 4) AS total_logical_tibytes, + active_logical_bytes AS active_uncompressed_bytes, + active_logical_bytes/POW(1024, 3) AS active_uncompressed_gibytes, + active_logical_bytes/POW(1024, 4) AS active_uncompressed_tibytes, + long_term_logical_bytes AS long_term_uncompressed_bytes, + long_term_logical_bytes/POW(1024, 3) AS long_term_uncompressed_gibytes, + long_term_logical_bytes/POW(1024, 4) AS long_term_uncompressed_tibytes, + + -- Compressed bytes + total_physical_bytes AS total_compressed_bytes, + total_physical_bytes/POW(1024, 3) AS total_compressed_gibytes, + total_physical_bytes/POW(1024, 4) AS total_compressed_tibytes, + -- Note that active physical bytes includes time travel so need to remove that + active_physical_bytes-time_travel_physical_bytes AS active_compressed_bytes, + (active_physical_bytes-time_travel_physical_bytes)/POW(1024, 3) AS active_compressed_gibytes, + (active_physical_bytes-time_travel_physical_bytes)/POW(1024, 4) AS active_compressed_tibytes, + long_term_physical_bytes AS long_term_compressed_bytes, + long_term_physical_bytes/POW(1024, 3) AS long_term_compressed_gibytes, + long_term_physical_bytes/POW(1024, 4) AS long_term_compressed_tibytes, + time_travel_physical_bytes AS time_travel_compressed_bytes, + time_travel_physical_bytes/POW(1024, 3) AS time_travel_compressed_gibytes, + time_travel_physical_bytes/POW(1024, 4) AS time_travel_compressed_tibytes, + fail_safe_physical_bytes AS 
fail_safe_physical_bytes, + fail_safe_physical_bytes/POW(1024, 3) AS fail_safe_compressed_gibytes, + fail_safe_physical_bytes/POW(1024, 4) AS fail_safe_compressed_tibytes, + + -- Compression ratios + SAFE_DIVIDE(total_logical_bytes, total_physical_bytes) AS total_compression_ratio, -- Defined as uncompressed size/compressed size + SAFE_DIVIDE(long_term_logical_bytes, long_term_physical_bytes) AS long_term_compression_ratio, + SAFE_DIVIDE(active_logical_bytes, active_physical_bytes) AS active_compression_ratio, + + -- Pricing + ((active_logical_bytes/POW(1024, 3))*active_logical_price_per_gb) + + ((long_term_logical_bytes/POW(1024, 3))*long_term_logical_price_per_gb) AS total_uncompressed_price, + ((active_logical_bytes/POW(1024, 3))*active_logical_price_per_gb) AS active_uncompressed_price, + ((long_term_logical_bytes/POW(1024, 3))*long_term_logical_price_per_gb) AS long_term_uncompressed_price, + (((active_physical_bytes-time_travel_physical_bytes)/POW(1024, 3))*active_physical_price_per_gb) + + ((long_term_physical_bytes/POW(1024, 3))*long_term_physical_price_per_gb) AS total_compressed_price, + (((active_physical_bytes-time_travel_physical_bytes)/POW(1024, 3))*active_physical_price_per_gb) AS active_compressed_price, + (long_term_physical_bytes/POW(1024, 3))*long_term_physical_price_per_gb AS long_term_compressed_price, + (time_travel_physical_bytes/POW(1024, 3))*active_physical_price_per_gb AS time_travel_compressed_price, + (fail_safe_physical_bytes/POW(1024, 3))*active_physical_price_per_gb AS fail_safe_compressed_price + FROM + -- End user: Change to reflect your project and region + ``.``.INFORMATION_SCHEMA.TABLE_STORAGE AS tb + + -- Need to join on TABLES for existing tables to remove any temporary or job result tables + -- Note due to this information being in the TABLE_STORAGE view this means it cannot be + -- performed across an entire organization without checking the TABLES view in each project. + -- End user: Change to reflect your project and region + JOIN ``.``.INFORMATION_SCHEMA.TABLES AS t + ON t.table_catalog = tb.project_id + AND t.table_schema = tb.table_schema + AND t.table_name = tb.table_name + WHERE + tb.deleted = false + -- Only look at the BASE TABLE type, as this is what Google uses in their billing data to + -- bill on even if there are clones, snapshots, etc. + AND t.table_type = 'BASE TABLE' +), +schemata_options AS +( + /* + * Extract the storage billing model + * Note that if it's not listed then it's logical; however, if it was converted before ~August 26, 2023 + * it might be physical, as Google did not backfill the schemata view showing the change. 
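+ * For example, a dataset that has never had storage_billing_model set will have no row in + * SCHEMATA_OPTIONS and falls through to 'LOGICAL' via the COALESCE below.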
+ */ + SELECT + schema_name, + option_value + FROM + -- End user: Change to reflect your project and region + ``.``.INFORMATION_SCHEMA.SCHEMATA_OPTIONS + WHERE + option_name = 'storage_billing_model' +), +storage_and_dataset_billing_type AS +( + -- Gets project name and dataset name + SELECT + S.* EXCEPT(table_schema), -- Excluding table_schema to make grouping easier below and to remove redundant data + COALESCE(SO.option_value, 'LOGICAL') AS current_storage_model + FROM + storage AS S + LEFT OUTER JOIN schemata_options AS SO + ON S.table_schema = SO.schema_name +), +calculations AS +( + SELECT + dataset, + current_storage_model, + SUM(active_uncompressed_price) AS active_uncompressed_price, + SUM(active_compressed_price) AS active_compressed_price, + SUM(long_term_uncompressed_price) AS long_term_uncompressed_price, + SUM(long_term_compressed_price) AS long_term_compressed_price, + SUM(time_travel_compressed_price) AS time_travel_compressed_price, + SUM(fail_safe_compressed_price) AS fail_safe_compressed_price + FROM + storage_and_dataset_billing_type + GROUP BY + dataset, current_storage_model +), +final_data AS +( + SELECT + dataset, + + -- Price differences, note that >0 means physical storage is cheaper before adding in time travel and fail-safe + active_uncompressed_price-active_compressed_price AS active_price_difference, + long_term_uncompressed_price-long_term_compressed_price AS long_term_price_difference, + + -- Time travel and fail safe storage reductions + (time_travel_compressed_price+fail_safe_compressed_price) AS additional_costs_for_physical_storage, + + -- Totals for each model + active_uncompressed_price+long_term_uncompressed_price AS logical_storage_price, + (active_compressed_price+long_term_compressed_price)+ + (time_travel_compressed_price+fail_safe_compressed_price) AS physical_storage_price, + + -- Difference in values (logical - physical) + (active_uncompressed_price+long_term_uncompressed_price) - + ( + (active_compressed_price+long_term_compressed_price)+ + (time_travel_compressed_price+fail_safe_compressed_price) + ) AS difference, + + current_storage_model, + + active_uncompressed_price, + active_compressed_price, + long_term_uncompressed_price, + long_term_compressed_price + FROM + calculations +) + +SELECT + dataset, + + -- Logical prices and base (before adding in time travel and fail-safe reductions) physical price + CONCAT('$ ',FORMAT("%'.2f", active_uncompressed_price)) AS logical_active_price, + --CONCAT('$ ',FORMAT("%'.2f", active_compressed_price)) AS base_physical_price, + CONCAT('$ ',FORMAT("%'.2f", long_term_uncompressed_price)) AS logical_long_term_price, + /* renamed both physical storage types to match the logical naming conventions and grouped them together */ + CONCAT('$ ',FORMAT("%'.2f", active_compressed_price)) AS physical_active_price, + --CONCAT('$ ',FORMAT("%'.2f", long_term_compressed_price)) AS base_long_term_price, + CONCAT('$ ',FORMAT("%'.2f", long_term_compressed_price)) AS physical_long_term_price, + + -- Time travel and fail safe storage reductions + CONCAT('$ ',FORMAT("%'.2f", additional_costs_for_physical_storage)) AS additional_costs_for_physical_storage, + + -- Totals for each model + -- Note physical_storage_price is the total price with the time-travel and fail-safe reductions factored in + CONCAT('$ ',FORMAT("%'.2f", logical_storage_price)) AS logical_storage_price, + CONCAT('$ ',FORMAT("%'.2f", physical_storage_price)) AS physical_storage_price, + + -- Difference between logical storage and physical storage (logical - 
physical) + -- Note that a negative value means logical/uncompressed is cheaper + --CONCAT('$ ',FORMAT("%'.2f", difference)) AS difference_in_price_if_physical_is_chosen, + CONCAT('$ ',FORMAT("%'.2f", difference)) AS savings_in_price_if_physical_is_chosen, + + -- Saves or Costs More Money + + -- Current storage model + /* + * Note this may be incorrect if you changed your billing model to physical prior to August 21, 2023. + * The reason is that Google didn't backfill the billing model to customers that were early adopters of the + * public version of physical storage. + * + * Bugtracker to see this: https://issuetracker.google.com/issues/296794707 + */ + current_storage_model, + + -- Recommendation for what to do + /* + * The SQL makes this look more complex than it is; the logic is simple. + * If currently on logical and physical storage is cheaper, recommend changing to physical storage; otherwise recommend no change. + * If currently on physical and logical storage is cheaper, recommend changing back to logical storage; otherwise recommend no change. + */ + IF( + current_storage_model = 'LOGICAL', + -- Is on logical storage currently + IF( + logical_storage_price < physical_storage_price, + 'Do nothing as logical storage is the best option (currently logical).', + 'Change dataset to physical storage for additional savings.' + ), + -- Is on physical storage currently + IF(logical_storage_price < physical_storage_price, + 'Dataset is currently using physical storage and costing you more money than logical storage. Change dataset back to logical storage.', + 'Do nothing as physical storage is the best option (currently physical).') + ) AS recommendation, + + -- Query to run + /* + * This again looks more complex due to the SQL, but it uses the same logic as the statement above and emits SQL to make the change. + */ + IF( + current_storage_model = 'LOGICAL', + -- Is on logical storage currently + IF( + logical_storage_price < physical_storage_price, + -- Do nothing + NULL, + CONCAT( + -- Add in the location + REPLACE(location_query_template, 'region-', ''), + -- Use the change to physical storage query + REPLACE(physical_storage_query_template, '<dataset>', dataset)) + ), + -- Is on physical storage currently + IF( + logical_storage_price < physical_storage_price, + CONCAT( + -- Add in the location + REPLACE(location_query_template, 'region-', ''), + -- Use the change to logical storage query + REPLACE(logical_storage_query_template, '<dataset>', dataset)) + , + -- Do nothing + NULL + ) + ) AS recommendation_change_SQL + + -- If you wish to get the raw values that are not formatted uncomment the below line + --final_data.* EXCEPT(dataset) + FROM + final_data +; diff --git a/information_schema/queries_and_reservation_usage_per_second.sql b/information_schema/queries_and_reservation_usage_per_second.sql new file mode 100644 index 0000000..800ba3b --- /dev/null +++ b/information_schema/queries_and_reservation_usage_per_second.sql @@ -0,0 +1,124 @@ +/* + * This query retrieves a detailed, per-second timeline of how slots were utilized and assigned, helping you monitor and understand your BigQuery reservation performance. + * It is useful for observing the autoscaler behavior on a per-reservation basis. + + * Instructions for use: + * 1. Modify the `<project>` and `<region>` placeholders below to match your required values. + * 2. Change the interval_in_days value to look further back in time. By default this is 7 days. 
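+ * e.g. setting interval_in_days to 3 limits the results to the last 3 days (note the 7-day retention caveat below).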
+ * + * Important Notes: + * This uses the INFORMATION_SCHEMA.JOBS_TIMELINE_BY_ORGANIZATION view, which might need additional permissions, or you might need to add specific reservations to the WHERE clause to filter the results. + * The INFORMATION_SCHEMA.JOBS_TIMELINE_BY_ORGANIZATION view only retains data for 7 days, so you will only be able to go back that far in time. + * The jobs column is a repeated field (an array of structs) showing the jobs that were running during that second for that reservation. + * + * Note: If not using the US or EU multi-regions, then the costs may be different. + * Change the standard_edition_cost, enterprise_edition_cost, and enterprise_plus_edition_cost values below to match the actual cost listed here: + * https://cloud.google.com/bigquery/pricing?hl=en#:~:text=size%20calculation.-,Capacity%20compute%20pricing,-BigQuery%20offers%20a + */ + +-- Modify this to go further back in time +DECLARE interval_in_days INT64 DEFAULT 7; + +-- Modify these values if not using the US or EU multi-regions +-- Values can be found here: https://cloud.google.com/bigquery/pricing?hl=en#:~:text=size%20calculation.-,Capacity%20compute%20pricing,-BigQuery%20offers%20a +DECLARE standard_edition_cost NUMERIC DEFAULT 0.04; +DECLARE enterprise_edition_cost NUMERIC DEFAULT 0.06; +DECLARE enterprise_plus_edition_cost NUMERIC DEFAULT 0.10; + +WITH slot_data AS +( + SELECT + -- Start time (aggregated per-second) + details.start_time, + reservation_name, + edition, + -- Number of slots added to the reservation by autoscaling at this second. + details.autoscale_current_slots, + -- Maximum number of slots that could be added to the reservation by autoscaling at this second. + details.autoscale_max_slots, + -- Number of slots assigned to this reservation at this second. It equals the baseline slot capacity of a reservation. + details.slots_assigned, + -- Maximum slot capacity for this reservation, including slot sharing at this second. + -- If ignore_idle_slots is true, this field is same as slots_assigned. + -- Otherwise, the slots_max_assigned field is the total number of slots in all capacity commitments in the administration project. 
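+ -- For example, a reservation with a 100-slot baseline and ignore_idle_slots set to false may + -- use idle committed slots up to slots_max_assigned.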
+ details.slots_max_assigned + FROM + -- Schema reference is here: https://cloud.google.com/bigquery/docs/information-schema-reservation-timeline#schema + -- Change the `<project>` and `<region>` placeholders below to match your required values + ``.``.INFORMATION_SCHEMA.RESERVATIONS_TIMELINE, + -- The per_second_details column is an array of structs, so it needs to be unnested and joined against the rest of the data + -- The full row is per minute, and the details are per second within that minute, hence the array in this column + UNNEST(per_second_details) AS details + WHERE + period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) + AND CURRENT_TIMESTAMP() +), +job_data AS +( + SELECT + period_start, + -- Extract the reservation name from the fully-qualified reservation ID returned by this view + SPLIT(reservation_id, '.')[OFFSET(ARRAY_LENGTH(SPLIT(reservation_id, '.')) - 1)] AS reservation_name, + -- Create an array of structs showing the jobs that were running during this period + ARRAY_AGG( + STRUCT( + period_slot_ms, + period_slot_ms/1000 AS period_slot_s, + ((CASE edition + WHEN 'STANDARD' THEN standard_edition_cost + WHEN 'ENTERPRISE' THEN enterprise_edition_cost + WHEN 'ENTERPRISE PLUS' THEN enterprise_plus_edition_cost + END + )/3600) * (period_slot_ms/1000) AS editions_cost, + project_id, + user_email, + job_id, + statement_type, + job_start_time, + state, + -- Note that the bytes billed reported are for the entire job, not just the portion that ran during this second + -- This is because jobs can run across multiple seconds, and the view does not provide per-second billed bytes for on-demand jobs + total_bytes_billed AS total_bytes_billed_for_full_job + ) + ) AS jobs + FROM + ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE_BY_ORGANIZATION + WHERE + period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) + AND CURRENT_TIMESTAMP() + -- Use the partition column to grab jobs as well to reduce bytes processed + -- This might cause some jobs to be lost if they started before the boundary, but otherwise + -- this query might process too much data for many users. 
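+ -- e.g. a job created 8 days ago whose execution spilled into the 7-day window will be + -- dropped by this filter.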
+    -- Note this is job_creation_time, not creation_time as the docs say
+    AND job_creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY)
+      AND CURRENT_TIMESTAMP()
+    -- Note that this only pulls queries and excludes SCRIPT job types
+    -- Change this if needed for other job types
+    AND job_type = 'QUERY'
+    AND (statement_type != 'SCRIPT' OR statement_type IS NULL)
+  GROUP BY
+    period_start, reservation_name
+),
+combined_data AS
+(
+  SELECT
+    start_time,
+    slot_data.reservation_name,
+    edition,
+    autoscale_current_slots,
+    autoscale_max_slots,
+    slots_assigned,
+    slots_max_assigned,
+    jobs
+  FROM
+    -- LEFT JOIN keeps the seconds where the reservation held slots but no jobs were running
+    slot_data LEFT JOIN job_data
+    ON slot_data.start_time = job_data.period_start
+    AND slot_data.reservation_name = job_data.reservation_name
+)
+
+SELECT
+  *
+FROM
+  combined_data
+ORDER BY
+  combined_data.start_time DESC
diff --git a/information_schema/query_job_information.sql b/information_schema/query_job_information.sql
index 6ed26a4..217bd46 100644
--- a/information_schema/query_job_information.sql
+++ b/information_schema/query_job_information.sql
@@ -16,7 +16,7 @@ WITH referenced_tables AS
     referencedTables,
     labels,
     ROUND(SAFE_DIVIDE(total_bytes_billed,
-        POW(1024, 4)) * 5, 2) AS onDemandCost,
+        POW(1024, 4)) * 6.25, 2) AS onDemandCost,
     ROUND(COALESCE(total_bytes_billed,
         0), 2) AS totalBytesBilled,
     ROUND(COALESCE(total_bytes_billed,
@@ -51,4 +51,4 @@ SELECT
   *
 FROM
   jobsDeduplicated;
-END
\ No newline at end of file
+END
diff --git a/information_schema/query_slots_per_minute.sql b/information_schema/query_slots_per_minute.sql
index 8421001..69322ca 100644
--- a/information_schema/query_slots_per_minute.sql
+++ b/information_schema/query_slots_per_minute.sql
@@ -1,59 +1,46 @@
--- This query will show the query slot usage of all queries for each second in the specified timeframe.
+-- This query will show the aggregate average slot usage of all query jobs for each minute in the specified timeframe.
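+-- For example, if jobs accumulate 120,000 slot-ms within one minute, SAFE_DIVIDE(120000, 1000 * 60)
+-- reports an average of 2 slots for that minute (a worked example, not output produced by the query).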
 -- Change this value to change how far in the past the query will search
 DECLARE interval_in_days INT64 DEFAULT 7;
+DECLARE time_period INT64 DEFAULT (1000 * 60); -- Number of milliseconds in a minute
+
 BEGIN
 WITH
   src AS (
   SELECT
-    user_email AS user,
-    job_id AS jobId,
-    query,
-    DATETIME_TRUNC(start_time,
-      MINUTE) AS startTime,
-    DATETIME_TRUNC(end_time,
-      MINUTE) AS endTime,
-    SAFE_DIVIDE(total_slot_ms,
-      TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS slotCount,
-    TIMESTAMP_DIFF(end_time, start_time, MINUTE) AS diff,
-    ROW_NUMBER() OVER (PARTITION BY job_id ORDER BY end_time DESC) AS _rnk
-  FROM
-    ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT
-  WHERE
-    creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY)
-    AND CURRENT_TIMESTAMP()
-    AND total_slot_ms IS NOT NULL
-    AND job_type = 'QUERY'),
-  jobsDeduplicated AS (
-  SELECT
-    * EXCEPT(_rnk)
+    SAFE_DIVIDE(SUM(period_slot_ms), time_period) AS slot_usage, -- Divide by 1 minute (1000 ms * 60 seconds) to convert to slots/minute
+    TIMESTAMP_TRUNC(period_start, MINUTE) as period_start
   FROM
-    src
+    ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE
   WHERE
-    _rnk = 1 ),
-  differences AS (
-  SELECT
-    *,
-    generate_timestamp_array(startTime,
-      endTime,
-      INTERVAL 1 MINUTE) AS int
-  FROM
-    jobsDeduplicated ),
-  byMinutes AS (
+    period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) AND CURRENT_TIMESTAMP()
+    AND job_type = 'QUERY'
+  GROUP BY
+    period_start
+  ORDER BY
+    period_start DESC
+  ),
+  time_series AS(
+  SELECT
+    *
+  FROM
+    UNNEST(generate_timestamp_array(DATETIME_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY), MINUTE),
+      DATETIME_TRUNC(CURRENT_TIMESTAMP(), MINUTE),
+      INTERVAL 1 MINUTE)) AS timeInterval
+  ),
+  joined AS (
   SELECT
-    * EXCEPT(int)
+    COALESCE(src.slot_usage, 0) as slot_usage,
+    timeInterval
   FROM
-    differences,
-    UNNEST(int) AS minute )
+    src RIGHT OUTER JOIN time_series
+    ON period_start = timeInterval
+  )
 SELECT
-  slotCount,
-  user,
-  jobId,
-  query,
-  minute
+  *
 FROM
-  byMinutes
+  joined
 ORDER BY
-  minute ASC;
+  timeInterval ASC;
 END
\ No newline at end of file
diff --git a/information_schema/query_slots_per_minute_and_user.sql b/information_schema/query_slots_per_minute_and_user.sql
new file mode 100644
index 0000000..59a4798
--- /dev/null
+++ b/information_schema/query_slots_per_minute_and_user.sql
@@ -0,0 +1,50 @@
+-- This query will show the aggregate average slot usage of all jobs for each minute in the specified timeframe,
+-- grouped by user over that period.
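+-- Filler minutes coming from the generated time series have no matching job rows, so user_email will be
+-- NULL on those rows. To focus on a single user you could add a filter such as (hypothetical value):
+--   AND user_email = 'analyst@example.com'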
+ +-- Change this value to change how far in the past the query will search +DECLARE interval_in_days INT64 DEFAULT 7; + +DECLARE time_period INT64 DEFAULT (1000 * 60); -- Number of milliseconds in a minute + +BEGIN + WITH src AS ( + SELECT + SAFE_DIVIDE(SUM(period_slot_ms), time_period) AS slot_usage, -- Divide by 1 minute (1000 ms * 60 seconds) to convert to slots/minute + user_email, + TIMESTAMP_TRUNC(period_start, MINUTE) as period_start + FROM + ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE + WHERE + period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) AND CURRENT_TIMESTAMP() + AND job_type = 'QUERY' + GROUP BY + period_start, + user_email + ORDER BY + period_start DESC + ), + time_series AS( + SELECT + * + FROM + UNNEST(generate_timestamp_array(DATETIME_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY), MINUTE), + DATETIME_TRUNC(CURRENT_TIMESTAMP(), MINUTE), + INTERVAL 1 MINUTE)) AS timeInterval + ), + joined AS ( + SELECT + COALESCE(src.slot_usage, 0) as slot_usage, + user_email, + timeInterval + FROM + src RIGHT OUTER JOIN time_series + ON period_start = timeInterval + ) + +SELECT + * +FROM + joined +ORDER BY + timeInterval ASC; +END \ No newline at end of file diff --git a/information_schema/query_slots_per_second.sql b/information_schema/query_slots_per_second.sql index 2adf19b..96621ad 100644 --- a/information_schema/query_slots_per_second.sql +++ b/information_schema/query_slots_per_second.sql @@ -1,60 +1,46 @@ --- This query will show the query slot usage of all queries for each second in the specified timeframe. +-- This query will show the aggregate average slot usage of all query jobs for each second in the specified timeframe. -- Change this value to change how far in the past the query will search DECLARE interval_in_days INT64 DEFAULT 7; +DECLARE time_period INT64 DEFAULT (1000); -- Number of milliseconds in a second + BEGIN -WITH - src AS ( - SELECT - user_email AS user, - job_id AS jobId, - query, - DATETIME_TRUNC(start_time, - SECOND) AS startTime, - DATETIME_TRUNC(end_time, - SECOND) AS endTime, - SAFE_DIVIDE(total_slot_ms, - TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS slotCount, - TIMESTAMP_DIFF(end_time, start_time, MINUTE) AS diff, - ROW_NUMBER() OVER (PARTITION BY job_id ORDER BY end_time DESC) AS _rnk - FROM - ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT - WHERE - creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) - AND CURRENT_TIMESTAMP() - AND total_slot_ms IS NOT NULL - AND job_type = 'QUERY'), - jobsDeduplicated AS ( + WITH src AS ( SELECT - * EXCEPT(_rnk) + SAFE_DIVIDE(SUM(period_slot_ms), time_period) AS slotUsage, -- Divide by 1 second (1000 ms) to convert to slots/second + period_start FROM - src + ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE WHERE - _rnk = 1 ), - differences AS ( - SELECT - *, - generate_timestamp_array(startTime, - endTime, - INTERVAL 1 SECOND) AS int - FROM - jobsDeduplicated ), - bySeconds AS ( + period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) AND CURRENT_TIMESTAMP() + AND job_type = 'QUERY' + GROUP BY + period_start + ORDER BY + period_start DESC + ), + timeSeries AS( + SELECT + * + FROM + UNNEST(generate_timestamp_array(DATETIME_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY), SECOND), + DATETIME_TRUNC(CURRENT_TIMESTAMP(), SECOND), + INTERVAL 1 SECOND)) AS timeInterval + ), + joined AS ( SELECT - * EXCEPT(int) + COALESCE(src.slotUsage, 0) as slotUsage, + timeInterval FROM - 
differences,
-    UNNEST(int) AS second )
+    src RIGHT OUTER JOIN timeSeries
+    ON period_start = timeInterval
+  )
 SELECT
-  slotCount,
-  user,
-  jobId,
-  query,
-  second
+  *
 FROM
-  bySeconds
+  joined
 ORDER BY
-  second ASC;
+  timeInterval ASC;
 END
\ No newline at end of file
diff --git a/information_schema/queued_queries_by_second.sql b/information_schema/queued_queries_by_second.sql
new file mode 100644
index 0000000..b0a1d15
--- /dev/null
+++ b/information_schema/queued_queries_by_second.sql
@@ -0,0 +1,35 @@
+/*
+This query will show the number of queued jobs broken down by second, regardless
+of job type.
+
+The nested query (base_data) can be used on its own to identify job_ids once a
+hotspot timeframe has been identified.
+*/
+
+-- Change this value to change how far in the past the query will search
+DECLARE interval_in_days INT64 DEFAULT 1;
+
+WITH base_data AS
+(
+  SELECT
+    period_start,
+    job_id,
+    state,
+    error_result.reason,
+    error_result.location,
+    error_result.message
+  FROM
+    ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE_BY_PROJECT
+  WHERE
+    job_creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY)
+      AND CURRENT_TIMESTAMP()
+    AND state = 'PENDING'
+)
+
+SELECT
+  period_start,
+  COUNT(DISTINCT job_id) AS queued_jobs_cnt
+FROM
+  base_data
+GROUP BY 1
+ORDER BY 1;
\ No newline at end of file
diff --git a/information_schema/slots_by_day.sql b/information_schema/slots_by_day.sql
index 78d719e..fcfb487 100644
--- a/information_schema/slots_by_day.sql
+++ b/information_schema/slots_by_day.sql
@@ -3,24 +3,24 @@
 -- Change this value to change how far in the past the query will search
 DECLARE interval_in_days INT64 DEFAULT 7;
-DECLARE time_period INT64;
-SET time_period = (1000*60*60*24); -- Number of milliseconds in a day
+DECLARE time_period INT64 DEFAULT (1000*60*60*24); -- Number of milliseconds in a day
 BEGIN
 WITH src AS (
   SELECT
-    SAFE_DIVIDE(SUM(total_slot_ms), time_period) AS slotUsage,
-    DATETIME_TRUNC(creation_time,
-      DAY) AS creationTime
+    SAFE_DIVIDE(SUM(period_slot_ms), time_period) AS slot_usage, -- Divide by 1 day (1000 ms * 60 s * 60 min * 24 h) to convert to slots/day
+    TIMESTAMP_TRUNC(period_start, DAY) as period_start
   FROM
-    ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT
+    ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE
   WHERE
-    creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY)
-      AND CURRENT_TIMESTAMP()
-    AND total_slot_ms IS NOT NULL
+    period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) AND CURRENT_TIMESTAMP()
+    AND statement_type <> 'SCRIPT' -- Exclude scripts since they pull in child process slots and skew results
   GROUP BY
-    creationTime),
-  timeSeries AS(
+    period_start
+  ORDER BY
+    period_start DESC
+  ),
+  time_series AS(
   SELECT
    *
   FROM
@@ -30,11 +30,12 @@ BEGIN
   ),
   joined AS (
   SELECT
-    COALESCE(src.slotUsage, 0) as slotUsage,
+    COALESCE(src.slot_usage, 0) as slot_usage,
     timeInterval
   FROM
-    src RIGHT OUTER JOIN timeSeries
-    ON creationTime = timeInterval)
+    src RIGHT OUTER JOIN time_series
+    ON period_start = timeInterval
+  )
 SELECT
  *
@@ -42,4 +43,4 @@ FROM
   joined
 ORDER BY
   timeInterval ASC;
-END
\ No newline at end of file
+END
diff --git a/information_schema/slots_by_hour.sql b/information_schema/slots_by_hour.sql
index 25b643e..45f3276 100644
--- a/information_schema/slots_by_hour.sql
+++ b/information_schema/slots_by_hour.sql
@@ -3,24 +3,24 @@
 -- Change this value to change how far in the past the query will search
 DECLARE interval_in_days INT64 DEFAULT 7;
-DECLARE time_period INT64;
-SET time_period = (1000*60*60); -- Number of milliseconds in a hour
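+-- (1000 * 60 * 60 = 3,600,000 milliseconds per hour; dividing summed slot-ms by this yields average slots per hour)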
+DECLARE time_period INT64 DEFAULT (1000*60*60); -- Number of milliseconds in an hour
 BEGIN
 WITH src AS (
   SELECT
-    SAFE_DIVIDE(SUM(total_slot_ms), time_period) AS slotUsage,
-    DATETIME_TRUNC(creation_time,
-      HOUR) AS creationTime
+    SAFE_DIVIDE(SUM(period_slot_ms), time_period) AS slot_usage, -- Divide by 1 hour (1000 ms * 60 s * 60 min) to convert to slots/hour
+    TIMESTAMP_TRUNC(period_start, HOUR) as period_start
   FROM
-    ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT
+    ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE
   WHERE
-    creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY)
-      AND CURRENT_TIMESTAMP()
-    AND total_slot_ms IS NOT NULL
+    period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) AND CURRENT_TIMESTAMP()
+    AND statement_type <> 'SCRIPT' -- Exclude scripts since they pull in child process slots and skew results
   GROUP BY
-    creationTime),
-  timeSeries AS(
+    period_start
+  ORDER BY
+    period_start DESC
+  ),
+  time_series AS(
   SELECT
    *
   FROM
@@ -30,11 +30,12 @@ BEGIN
   ),
   joined AS (
   SELECT
-    COALESCE(src.slotUsage, 0) as slotUsage,
+    COALESCE(src.slot_usage, 0) as slot_usage,
     timeInterval
   FROM
-    src RIGHT OUTER JOIN timeSeries
-    ON creationTime = timeInterval)
+    src RIGHT OUTER JOIN time_series
+    ON period_start = timeInterval
+  )
 SELECT
  *
diff --git a/information_schema/slots_by_minute.sql b/information_schema/slots_by_minute.sql
index 7a75239..604a719 100644
--- a/information_schema/slots_by_minute.sql
+++ b/information_schema/slots_by_minute.sql
@@ -3,24 +3,24 @@
 -- Change this value to change how far in the past the query will search
 DECLARE interval_in_days INT64 DEFAULT 7;
-DECLARE time_period INT64;
-SET time_period = (1000*60); -- Number of milliseconds in a minute
+DECLARE time_period INT64 DEFAULT (1000 * 60); -- Number of milliseconds in a minute
 BEGIN
 WITH src AS (
   SELECT
-    SAFE_DIVIDE(SUM(total_slot_ms), time_period) AS slotUsage,
-    DATETIME_TRUNC(creation_time,
-      MINUTE) AS creationTime
+    SAFE_DIVIDE(SUM(period_slot_ms), time_period) AS slot_usage, -- Divide by 1 minute (1000 ms * 60 seconds) to convert to slots/minute
+    TIMESTAMP_TRUNC(period_start, MINUTE) as period_start
   FROM
-    ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT
+    ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE
   WHERE
-    creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY)
-      AND CURRENT_TIMESTAMP()
-    AND total_slot_ms IS NOT NULL
+    period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) AND CURRENT_TIMESTAMP()
+    AND statement_type <> 'SCRIPT' -- Exclude scripts since they pull in child process slots and skew results
   GROUP BY
-    creationTime),
-  timeSeries AS(
+    period_start
+  ORDER BY
+    period_start DESC
+  ),
+  time_series AS(
   SELECT
    *
   FROM
@@ -30,11 +30,12 @@ BEGIN
   ),
   joined AS (
   SELECT
-    COALESCE(src.slotUsage, 0) as slotUsage,
+    COALESCE(src.slot_usage, 0) as slot_usage,
     timeInterval
   FROM
-    src RIGHT OUTER JOIN timeSeries
-    ON creationTime = timeInterval)
+    src RIGHT OUTER JOIN time_series
+    ON period_start = timeInterval
+  )
 SELECT
  *
diff --git a/information_schema/slots_by_minute_and_user.sql b/information_schema/slots_by_minute_and_user.sql
index 2b0ca02..4cf154c 100644
--- a/information_schema/slots_by_minute_and_user.sql
+++ b/information_schema/slots_by_minute_and_user.sql
@@ -1,46 +1,48 @@
--- This query will show the slot usage of all jobs for each minute in the interval for each user
+-- This query will show the aggregate average slot usage of all jobs for each minute in the specified timeframe,
+-- grouped by user over that period.
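+-- A follow-up over this script's output could rank each minute's heaviest user, e.g. (illustrative only):
+--   SELECT timeInterval, ARRAY_AGG(user_email ORDER BY slot_usage DESC LIMIT 1)[OFFSET(0)] AS top_user
+--   FROM <result of this script> GROUP BY timeInterval;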
 -- Change this value to change how far in the past the query will search
 DECLARE interval_in_days INT64 DEFAULT 7;
-DECLARE time_period INT64;
-SET time_period = (1000*60); -- Number of milliseconds in a minute
+DECLARE time_period INT64 DEFAULT (1000 * 60); -- Number of milliseconds in a minute
 BEGIN
 WITH src AS (
   SELECT
-    user_email AS user,
-    SAFE_DIVIDE(SUM(total_slot_ms), time_period) AS slotUsage,
-    DATETIME_TRUNC(creation_time, MINUTE) AS creationTime
+    SAFE_DIVIDE(SUM(period_slot_ms), time_period) AS slot_usage, -- Divide by 1 minute (1000 ms * 60 seconds) to convert to slots/minute
+    user_email,
+    TIMESTAMP_TRUNC(period_start, MINUTE) as period_start
   FROM
-    ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT
+    ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE
   WHERE
-    creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY)
-      AND CURRENT_TIMESTAMP()
-    AND total_slot_ms IS NOT NULL
+    period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) AND CURRENT_TIMESTAMP()
+    AND statement_type <> 'SCRIPT' -- Exclude scripts since they pull in child process slots and skew results
   GROUP BY
-    user_email, creationTime),
-  timeSeries AS(
-  SELECT
-    *
-  FROM
-    UNNEST(generate_timestamp_array(DATETIME_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY), MINUTE),
-      DATETIME_TRUNC(CURRENT_TIMESTAMP(), MINUTE),
-      INTERVAL 1 MINUTE)) AS timeInterval
+    period_start,
+    user_email
+  ORDER BY
+    period_start DESC
   ),
-  joined AS (
+  time_series AS(
+  SELECT
+    *
+  FROM
+    UNNEST(generate_timestamp_array(DATETIME_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY), MINUTE),
+      DATETIME_TRUNC(CURRENT_TIMESTAMP(), MINUTE),
+      INTERVAL 1 MINUTE)) AS timeInterval
+  ),
+  joined AS (
   SELECT
-    user,
-    COALESCE(slotUsage, 0) as slotUsage,
+    COALESCE(src.slot_usage, 0) as slot_usage,
+    user_email,
     timeInterval
   FROM
-    src RIGHT OUTER JOIN timeSeries
-    ON creationTime = timeInterval)
+    src RIGHT OUTER JOIN time_series
+    ON period_start = timeInterval
+  )
 SELECT
-  slotUsage,
-  user,
-  timeInterval
+  *
 FROM
   joined
 ORDER BY
diff --git a/information_schema/slots_by_second.sql b/information_schema/slots_by_second.sql
index dda80c3..b163697 100644
--- a/information_schema/slots_by_second.sql
+++ b/information_schema/slots_by_second.sql
@@ -3,24 +3,24 @@
 -- Change this value to change how far in the past the query will search
 DECLARE interval_in_days INT64 DEFAULT 7;
-DECLARE time_period INT64;
-SET time_period = (1000); -- Number of milliseconds in a second
+DECLARE time_period INT64 DEFAULT (1000); -- Number of milliseconds in a second
 BEGIN
 WITH src AS (
   SELECT
-    SAFE_DIVIDE(SUM(total_slot_ms), time_period) AS slotUsage,
-    DATETIME_TRUNC(creation_time,
-      SECOND) AS creationTime
+    SAFE_DIVIDE(SUM(period_slot_ms), time_period) AS slotUsage, -- Divide by 1 second (1000 ms) to convert to slots/second
+    period_start
   FROM
-    ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT
+    ``.``.INFORMATION_SCHEMA.JOBS_TIMELINE
   WHERE
-    creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY)
-      AND CURRENT_TIMESTAMP()
-    AND total_slot_ms IS NOT NULL
+    period_start BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY) AND CURRENT_TIMESTAMP()
+    AND statement_type <> 'SCRIPT' -- Exclude scripts since they pull in child process slots and skew results
   GROUP BY
-    creationTime),
-  timeSeries AS(
+    period_start
+  ORDER BY
+    period_start DESC
+  ),
+  timeSeries AS(
   SELECT
    *
   FROM
@@ -34,7 +34,8 @@ BEGIN
     timeInterval
   FROM
     src RIGHT OUTER JOIN timeSeries
-    ON creationTime = timeInterval)
+    ON period_start = timeInterval
+  )
 SELECT
  *
diff --git a/information_schema/table_storage_by_most_recently_queried.sql b/information_schema/table_storage_by_most_recently_queried.sql
new file mode 100644
index 0000000..a898dbd
--- /dev/null
+++ b/information_schema/table_storage_by_most_recently_queried.sql
@@ -0,0 +1,202 @@
+/*
+ * This query retrieves the table(s) each job references and joins them to each table's storage size, recording the last time each table was queried during the range.
+ * It then calculates the price for that table and displays it along with the storage data.
+
+ * Instructions for use:
+ * 1. Modify the project and region/dataset placeholders (the empty backticks) below to match your required values.
+ * 2. Change the interval_in_days value to travel back further in time. By default this is 14 days.
+
+ * Note: If not using the US or EU multi-regions, then the storage costs may be different.
+ * Change the storage price values below to match the actual cost listed here:
+ * https://cloud.google.com/bigquery/pricing?hl=en#storage-pricing
+ */
+
+-- Change this value to change how far in the past the query will search
+DECLARE interval_in_days INT64 DEFAULT 14;
+
+-- Update this with the correct values for your region (default is for US multi-region)
+-- EU multi-region values are below.
+-- These are prices per GiB monthly (not hourly)
+-- Can be found here: https://cloud.google.com/bigquery/pricing?hl=en#storage-pricing
+DECLARE active_logical_price NUMERIC DEFAULT 0.02;
+DECLARE long_term_logical_price NUMERIC DEFAULT 0.01;
+DECLARE active_physical_price NUMERIC DEFAULT 0.04;
+DECLARE long_term_physical_price NUMERIC DEFAULT 0.02;
+
+-- These values are for the EU multi-region
+-- Uncomment these and comment out the block above if using the EU multi-region
+/*
+DECLARE active_logical_price NUMERIC DEFAULT 0.02;
+DECLARE long_term_logical_price NUMERIC DEFAULT 0.01;
+DECLARE active_physical_price NUMERIC DEFAULT 0.044;
+DECLARE long_term_physical_price NUMERIC DEFAULT 0.022;
+*/
+
+WITH tables AS (
+  -- Basic table data and last query usage
+  SELECT
+    -- Create the fully qualified table name for joining
+    t_ref.project_id,
+    t_ref.dataset_id,
+    t_ref.table_id,
+
+    -- Using creation time as the last queried time since this is the partitioning column in the JOBS view
+    MAX(j.creation_time) AS last_queried_time
+  FROM
+    ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT AS j,
+    -- Extract the referenced tables
+    UNNEST(j.referenced_tables) AS t_ref
+  WHERE
+    -- Only grab successful query jobs
+    j.job_type = 'QUERY'
+    AND j.state = 'DONE'
+
+    -- Filter on the given range
+    AND creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY)
+      AND CURRENT_TIMESTAMP()
+  GROUP BY 1, 2, 3
+),
+table_billing_type AS (
+  -- Table billing type
+  SELECT
+    ts.project_id,
+    ts.table_schema AS dataset_id,
+    ts.table_name AS table_id,
+
+    -- Get the billing model option. Defaults to LOGICAL if the option is not explicitly set.
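+    -- (TABLE_OPTIONS stores option_value as the string 'PHYSICAL' or 'LOGICAL' when the option is explicitly set)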
+    IFNULL(topt.option_value, 'LOGICAL') AS storage_billing_model,
+
+    -- Conditionally determine the Total Billed Bytes
+    CASE
+      -- Use PHYSICAL bytes if the option is explicitly set to PHYSICAL
+      WHEN topt.option_value = 'PHYSICAL' THEN ts.total_physical_bytes
+      -- Otherwise (LOGICAL or NULL/default), use LOGICAL bytes
+      ELSE ts.total_logical_bytes
+    END AS total_billed_bytes,
+
+    -- Conditionally determine the Active Billed Bytes
+    CASE
+      WHEN topt.option_value = 'PHYSICAL' THEN ts.active_physical_bytes
+      ELSE ts.active_logical_bytes
+    END AS active_billed_bytes,
+
+    -- Conditionally determine the Long-Term Billed Bytes
+    CASE
+      WHEN topt.option_value = 'PHYSICAL' THEN ts.long_term_physical_bytes
+      ELSE ts.long_term_logical_bytes
+    END AS long_term_billed_bytes
+
+  FROM
+    ``.``.INFORMATION_SCHEMA.TABLE_STORAGE AS ts
+  INNER JOIN
+    -- Join with TABLES view to get the original creation time
+    ``.``.INFORMATION_SCHEMA.TABLES AS t
+    ON ts.project_id = t.table_catalog
+    AND ts.table_schema = t.table_schema
+    AND ts.table_name = t.table_name
+  LEFT JOIN
+    -- Left join TABLE_OPTIONS to find the storage billing model (it might be NULL if the default is used or it was converted in July 2023 before this option was added)
+    ``.``.INFORMATION_SCHEMA.TABLE_OPTIONS AS topt
+    ON ts.project_id = topt.table_catalog
+    AND ts.table_schema = topt.table_schema
+    AND ts.table_name = topt.table_name
+    AND topt.option_name = 'storage_billing_model'
+)
+
+SELECT
+  m.project_id,
+  m.dataset_id,
+  m.table_id,
+  -- Build a nice message instead of throwing a NULL value in for last queried time
+  IF(u.last_queried_time IS NULL, CONCAT('Prior to ', STRING(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY))), STRING(u.last_queried_time)) AS last_queried_time,
+  m.storage_billing_model,
+
+  -- Total monthly storage price
+  CONCAT('$ ',
+    FORMAT("%'.2f",
+      -- Adds active + long-term price (accounting for physical or logical)
+      -- Calculate active price
+      CASE
+        WHEN m.storage_billing_model = 'LOGICAL' THEN
+          (m.active_billed_bytes/POW(1024, 3)) * active_logical_price
+        ELSE
+          (m.active_billed_bytes/POW(1024, 3)) * active_physical_price
+      END
+      +
+      -- Calculate long-term price
+      CASE
+        WHEN m.storage_billing_model = 'LOGICAL' THEN
+          (m.long_term_billed_bytes/POW(1024, 3)) * long_term_logical_price
+        ELSE
+          (m.long_term_billed_bytes/POW(1024, 3)) * long_term_physical_price
+      END
+    )
+  ) AS table_storage_price,
+
+  -- Total Billed Size
+  FORMAT('%.2f %s',
+    m.total_billed_bytes / POW(1024,
+      CASE
+        WHEN m.total_billed_bytes >= POW(1024, 4) THEN 4 -- TiB
+        WHEN m.total_billed_bytes >= POW(1024, 3) THEN 3 -- GiB
+        WHEN m.total_billed_bytes >= POW(1024, 2) THEN 2 -- MiB
+        ELSE 1 -- KiB
+      END
+    ),
+    CASE
+      WHEN m.total_billed_bytes >= POW(1024, 4) THEN 'TiB'
+      WHEN m.total_billed_bytes >= POW(1024, 3) THEN 'GiB'
+      WHEN m.total_billed_bytes >= POW(1024, 2) THEN 'MiB'
+      ELSE 'KiB'
+    END
+  ) AS readable_total_billed_size,
+  m.total_billed_bytes,
+
+  -- Active Size
+  FORMAT('%.2f %s',
+    m.active_billed_bytes / POW(1024,
+      CASE
+        WHEN m.active_billed_bytes >= POW(1024, 4) THEN 4
+        WHEN m.active_billed_bytes >= POW(1024, 3) THEN 3
+        WHEN m.active_billed_bytes >= POW(1024, 2) THEN 2
+        ELSE 1
+      END
+    ),
+    CASE
+      WHEN m.active_billed_bytes >= POW(1024, 4) THEN 'TiB'
+      WHEN m.active_billed_bytes >= POW(1024, 3) THEN 'GiB'
+      WHEN m.active_billed_bytes >= POW(1024, 2) THEN 'MiB'
+      ELSE 'KiB'
+    END
+  ) AS readable_active_size,
+  m.active_billed_bytes,
+
+  -- Long-Term Size
+  FORMAT('%.2f %s',
+    m.long_term_billed_bytes / POW(1024,
+      CASE
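+        -- Pick the largest binary unit that keeps the displayed value at or above 1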
+        WHEN m.long_term_billed_bytes >= POW(1024, 4) THEN 4
+        WHEN m.long_term_billed_bytes >= POW(1024, 3) THEN 3
+        WHEN m.long_term_billed_bytes >= POW(1024, 2) THEN 2
+        ELSE 1
+      END
+    ),
+    CASE
+      WHEN m.long_term_billed_bytes >= POW(1024, 4) THEN 'TiB'
+      WHEN m.long_term_billed_bytes >= POW(1024, 3) THEN 'GiB'
+      WHEN m.long_term_billed_bytes >= POW(1024, 2) THEN 'MiB'
+      ELSE 'KiB'
+    END
+  ) AS readable_long_term_size
+FROM
+  table_billing_type AS m
+LEFT JOIN
+  tables AS u
+  ON m.project_id = u.project_id
+  AND m.dataset_id = u.dataset_id
+  AND m.table_id = u.table_id
+ORDER BY
+  -- Sort by last queried time (oldest first, with NULLs first)
+  u.last_queried_time ASC NULLS FIRST,
+  -- Secondary sort by the determined billed size descending
+  total_billed_bytes DESC;
\ No newline at end of file
diff --git a/information_schema/table_usage_and_costs_per_day.sql b/information_schema/table_usage_and_costs_per_day.sql
new file mode 100644
index 0000000..7d52246
--- /dev/null
+++ b/information_schema/table_usage_and_costs_per_day.sql
@@ -0,0 +1,57 @@
+-- This query shows costs and billed usage per day per table in your project.
+-- It may not be fully up to date, as this INFORMATION_SCHEMA data typically lags by 2-3 days
+
+-- Change this value to change how far in the past the query will search
+-- Note currently (as of 10/7/2025) this view only supports up to 90 days in the past
+DECLARE interval_in_days INT64 DEFAULT 7;
+
+-- Change these for different regions, this value is for the US multi-region
+-- Note these values are per gibibyte (GiB) hour
+-- These values can be found here: https://cloud.google.com/bigquery/pricing?hl=en#storage-pricing
+DECLARE active_logical_storage_per_hour NUMERIC DEFAULT 0.000027397;
+DECLARE long_term_logical_storage_per_hour NUMERIC DEFAULT 0.000013699;
+
+DECLARE active_physical_storage_per_hour NUMERIC DEFAULT 0.000054795;
+DECLARE long_term_physical_storage_per_hour NUMERIC DEFAULT 0.000027397;
+
+SELECT
+  usage_date,
+  project_id,
+  table_schema,
+  table_name,
+  SUM(billable_total_logical_usage)/POW(1024, 3) AS billable_total_logical_usage_in_gibibytes,
+  SUM(billable_total_logical_usage)/POW(1024, 4) AS billable_total_logical_usage_in_tebibytes,
+
+  SUM(billable_active_logical_usage)/POW(1024, 3) AS billable_active_logical_usage_in_gibibytes,
+  SUM(billable_active_logical_usage)/POW(1024, 4) AS billable_active_logical_usage_in_tebibytes,
+  SUM(billable_active_logical_usage)/POW(1024, 3) * active_logical_storage_per_hour AS active_logical_cost,
+  CONCAT('$ ',FORMAT("%'.2f", SUM(billable_active_logical_usage)/POW(1024, 3) * active_logical_storage_per_hour)) AS active_logical_pretty_cost,
+
+  SUM(billable_long_term_logical_usage)/POW(1024, 3) AS billable_long_term_logical_usage_in_gibibytes,
+  SUM(billable_long_term_logical_usage)/POW(1024, 4) AS billable_long_term_logical_usage_in_tebibytes,
+  SUM(billable_long_term_logical_usage)/POW(1024, 3) * long_term_logical_storage_per_hour AS long_term_logical_cost,
+  CONCAT('$ ',FORMAT("%'.2f", SUM(billable_long_term_logical_usage)/POW(1024, 3) * long_term_logical_storage_per_hour)) AS long_term_logical_pretty_cost,
+
+  SUM(billable_total_physical_usage)/POW(1024, 3) AS billable_total_physical_usage_in_gibibytes,
+  SUM(billable_total_physical_usage)/POW(1024, 4) AS billable_total_physical_usage_in_tebibytes,
+
+  SUM(billable_active_physical_usage)/POW(1024, 3) AS billable_active_physical_usage_in_gibibytes,
+  SUM(billable_active_physical_usage)/POW(1024, 4) AS billable_active_physical_usage_in_tebibytes,
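+  -- (With the default prices above, physical rates are double the logical rates, but physical bytes are
+  -- post-compression and often much smaller, so compare both models before switching.)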
+  SUM(billable_active_physical_usage)/POW(1024, 3) * active_physical_storage_per_hour AS active_physical_cost,
+  CONCAT('$ ',FORMAT("%'.2f", SUM(billable_active_physical_usage)/POW(1024, 3) * active_physical_storage_per_hour)) AS active_physical_pretty_cost,
+
+  SUM(billable_long_term_physical_usage)/POW(1024, 3) AS billable_long_term_physical_usage_in_gibibytes,
+  SUM(billable_long_term_physical_usage)/POW(1024, 4) AS billable_long_term_physical_usage_in_tebibytes,
+  SUM(billable_long_term_physical_usage)/POW(1024, 3) * long_term_physical_storage_per_hour AS long_term_physical_cost,
+  CONCAT('$ ',FORMAT("%'.2f", SUM(billable_long_term_physical_usage)/POW(1024, 3) * long_term_physical_storage_per_hour)) AS long_term_physical_pretty_cost
+FROM
+  ``.``.INFORMATION_SCHEMA.TABLE_STORAGE_USAGE_TIMELINE
+WHERE
+  usage_date >= DATE_SUB(CURRENT_DATE(), INTERVAL interval_in_days DAY)
+GROUP BY
+  1,
+  2,
+  3,
+  4
+ORDER BY
+  usage_date;
\ No newline at end of file
diff --git a/information_schema/top_billed_labels.sql b/information_schema/top_billed_labels.sql
index 255f589..d3aa72e 100644
--- a/information_schema/top_billed_labels.sql
+++ b/information_schema/top_billed_labels.sql
@@ -41,7 +41,7 @@ BEGIN
     ROUND(SUM(COALESCE(totalBilledBytes,
           0)) / POW(1024, 4), 2) AS totalTerabytesBilled,
     ROUND(SAFE_DIVIDE(SUM(COALESCE(totalBilledBytes,
             0)),
-        POW(1024, 4)) * 5, 2) AS onDemandCost,
+        POW(1024, 4)) * 6.25, 2) AS onDemandCost,
     labelKey,
     labelValue
   FROM jobsDeduplicated
@@ -50,4 +50,4 @@
     labelValue
 ORDER BY
   onDemandCost DESC;
-END
\ No newline at end of file
+END
diff --git a/information_schema/top_billed_queries.sql b/information_schema/top_billed_queries.sql
index 9df7d0b..651e39a 100644
--- a/information_schema/top_billed_queries.sql
+++ b/information_schema/top_billed_queries.sql
@@ -16,13 +16,13 @@ BEGIN
     ROUND(COALESCE(total_bytes_billed,
         0), 2) AS totalBytesBilled,
     ROUND(COALESCE(total_bytes_billed,
-        0) / POW(1024, 2), 2) AS totalMegabytesBilled,
+        0) / POW(1024, 2), 2) AS totalMebibytesBilled,
     ROUND(COALESCE(total_bytes_billed,
-        0) / POW(1024, 3), 2) AS totalGigabytesBilled,
+        0) / POW(1024, 3), 2) AS totalGibibytesBilled,
     ROUND(COALESCE(total_bytes_billed,
-        0) / POW(1024, 4), 2) AS totalTerabytesBilled,
+        0) / POW(1024, 4), 2) AS totalTebibytesBilled,
     ROUND(SAFE_DIVIDE(total_bytes_billed,
-        POW(1024, 4)) * 5, 2) AS cost,
+        POW(1024, 4)) * 6.25, 2) AS on_demand_cost,
     ROUND(SAFE_DIVIDE(total_slot_ms, TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)), 2) AS approximateSlotCount,
     ROW_NUMBER() OVER(PARTITION BY job_id ORDER BY end_time DESC) AS _rnk
@@ -49,4 +49,4 @@ FROM
   jobsDeduplicated
 ORDER BY
-  cost DESC;
-END
\ No newline at end of file
+  on_demand_cost DESC;
+END
diff --git a/information_schema/top_billed_queries_deduplicated.sql b/information_schema/top_billed_queries_deduplicated.sql
index faba08c..24e0340 100644
--- a/information_schema/top_billed_queries_deduplicated.sql
+++ b/information_schema/top_billed_queries_deduplicated.sql
@@ -18,13 +18,13 @@ BEGIN
     ROUND(COALESCE(total_bytes_billed,
         0), 2) AS totalBytesBilled,
     ROUND(COALESCE(total_bytes_billed,
-        0) / POW(1024, 2), 2) AS totalMegabytesBilled,
+        0) / POW(1024, 2), 2) AS totalMebibytesBilled,
     ROUND(COALESCE(total_bytes_billed,
-        0) / POW(1024, 3), 2) AS totalGigabytesBilled,
+        0) / POW(1024, 3), 2) AS totalGibibytesBilled,
     ROUND(COALESCE(total_bytes_billed,
-        0) / POW(1024, 4), 2) AS totalTerabytesBilled,
+        0) / POW(1024, 4), 2) AS totalTebibytesBilled,
     ROUND(SAFE_DIVIDE(total_bytes_billed,
-        POW(1024, 4)) * 5, 2) AS cost,
+        POW(1024, 4)) * 6.25, 2) AS on_demand_cost,
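+    -- (6.25 reflects the current US multi-region on-demand rate of $6.25 per TiB scanned; adjust for your region)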
     ROUND(SAFE_DIVIDE(total_slot_ms, TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)), 2) AS approximateSlotCount,
     ROW_NUMBER() OVER(PARTITION BY job_id ORDER BY end_time DESC) AS _rnk,
@@ -59,4 +59,4 @@ FROM
   queriesDeduplicated
 ORDER BY
-  cost DESC;
-END
\ No newline at end of file
+  on_demand_cost DESC;
+END
diff --git a/information_schema/top_complex_queries.sql b/information_schema/top_complex_queries.sql
index 7568d9c..e2fddad 100644
--- a/information_schema/top_complex_queries.sql
+++ b/information_schema/top_complex_queries.sql
@@ -16,7 +16,7 @@ BEGIN
     start_time AS startTime,
     end_time AS endTime,
     total_bytes_billed AS totalBytesBilled,
-    ROUND(SAFE_DIVIDE(total_bytes_billed, POW(1024, 4)) * 5, 2) AS onDemandCost,
+    ROUND(SAFE_DIVIDE(total_bytes_billed, POW(1024, 4)) * 6.25, 2) AS onDemandCost,
     TIMESTAMP_DIFF(end_time, start_time, SECOND) AS executionTime,
     TIMESTAMP_DIFF(end_time, start_time, MILLISECOND) AS executionTimeMs,
     ROW_NUMBER() OVER (PARTITION BY job_id ORDER BY end_time DESC) AS _rnk
@@ -42,4 +42,4 @@ FROM
   jobsDeduplicated
 ORDER BY
   approximateSlotCount DESC;
-END
\ No newline at end of file
+END
diff --git a/information_schema/top_cost_users.sql b/information_schema/top_cost_users.sql
index 561d4fc..51cbd34 100644
--- a/information_schema/top_cost_users.sql
+++ b/information_schema/top_cost_users.sql
@@ -23,7 +23,7 @@ BEGIN
   jobsDeduplicated AS (
   SELECT
     user,
-    ROUND(SAFE_DIVIDE(SUM(src.totalBytesBilled), POW(1024, 4)) * 5, 2) AS onDemandCost
+    ROUND(SAFE_DIVIDE(SUM(src.totalBytesBilled), POW(1024, 4)) * 6.25, 2) AS onDemandCost
   FROM
     src
   WHERE
@@ -38,4 +38,4 @@ FROM
 ORDER BY
   user DESC,
   onDemandCost DESC;
-END
\ No newline at end of file
+END
diff --git a/information_schema/top_costly_queries.sql b/information_schema/top_costly_queries.sql
index a190658..939bba4 100644
--- a/information_schema/top_costly_queries.sql
+++ b/information_schema/top_costly_queries.sql
@@ -55,7 +55,7 @@ SELECT
   ROUND(COALESCE(totalBytesBilled,
       0) / POW(1024, 4), 2) AS totalTerabytesBilled,
   ROUND(SAFE_DIVIDE(totalBytesBilled,
-      POW(1024, 4)) * 5, 2) AS onDemandCost
+      POW(1024, 4)) * 6.25, 2) AS onDemandCost
 FROM
   hashedQueries
 ORDER BY
diff --git a/information_schema/total_billing_recommendation.sql b/information_schema/total_billing_recommendation.sql
new file mode 100644
index 0000000..0a3ddb6
--- /dev/null
+++ b/information_schema/total_billing_recommendation.sql
@@ -0,0 +1,124 @@
+-- This query aggregates costs across all queries in the specified timeframe and compares
+-- on-demand pricing versus different BigQuery Editions pricing models
+
+-- Change this value to change how far in the past the query will search
+DECLARE interval_in_days INT64 DEFAULT 30;
+
+BEGIN
+WITH
+  src AS (
+  SELECT
+    ROUND(SAFE_DIVIDE(total_slot_ms,
+        TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)), 2) AS approximateSlotCount,
+    job_type,
+    query,
+    project_id AS projectId,
+    start_time AS startTime,
+    end_time AS endTime,
+    ROUND(COALESCE(total_bytes_billed,
+        0), 2) AS totalBytesBilled,
+    ROUND(COALESCE(total_bytes_billed,
+        0) / POW(1024, 2), 2) AS totalMegabytesBilled,
+    ROUND(COALESCE(total_bytes_billed,
+        0) / POW(1024, 3), 2) AS totalGigabytesBilled,
+    ROUND(COALESCE(total_bytes_billed,
+        0) / POW(1024, 4), 2) AS totalTerabytesBilled,
+    TIMESTAMP_DIFF(end_time, start_time, MILLISECOND) AS executionTimeMs
+  FROM
+    ``.``.INFORMATION_SCHEMA.JOBS_BY_PROJECT
+  WHERE
+    creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY)
+      AND CURRENT_TIMESTAMP()
+  ),
+  rounded AS (
+  SELECT
+    *,
+    -- Rounds up to the nearest 50 slots (autoscaler increments)
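+    -- e.g. an approximateSlotCount of 101 rounds up to 150, while 100 stays at 100 (illustrative)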
+    FLOOR((CEIL(approximateSlotCount) + 49) / 50) * 50 AS roundedUpSlots,
+
+    -- If query ran in under 1 minute (60 seconds * 1000 ms) then round up to 1 minute
+    IF(executionTimeMs < 1000*60, 1000*60, executionTimeMs) AS billedDurationMs,
+    -- Calculates the duration in hours for calculating slot-hours used
+    -- Formula: (Execution Time in ms)/(1000 ms * 60 seconds * 60 minutes)
+    IF(executionTimeMs < 1000*60, 1000*60, executionTimeMs)/(1000*60*60) AS billedDurationHour,
+
+    -- Apply minimum 10 MiB billing per query
+    GREATEST(totalBytesBilled, 10 * POW(1024, 2)) AS billedBytes,
+    GREATEST(totalMegabytesBilled, 10) AS billedMegabytes,
+    GREATEST(totalGigabytesBilled, 10/1024) AS billedGigabytes,
+    GREATEST(totalTerabytesBilled, 10/POW(1024, 2)) AS billedTerabytes
+  FROM src
+  ),
+  costs AS (
+  SELECT
+    *,
+    SAFE_DIVIDE(billedBytes,
+      POW(1024, 4)) * 6.25 AS onDemandCost,
+    -- Multiply by roundedUpSlots to correctly calculate slot-hours cost
+    roundedUpSlots * billedDurationHour * 0.04 AS standardEditionCost,
+    roundedUpSlots * billedDurationHour * 0.06 AS enterpriseEditionCost,
+    roundedUpSlots * billedDurationHour * 0.048 AS enterpriseEdition1YearCost,
+    roundedUpSlots * billedDurationHour * 0.036 AS enterpriseEdition3YearCost,
+    roundedUpSlots * billedDurationHour * 0.1 AS enterprisePlusEditionCost,
+    roundedUpSlots * billedDurationHour * 0.08 AS enterprisePlusEdition1YearCost,
+    roundedUpSlots * billedDurationHour * 0.06 AS enterprisePlusEdition3YearCost
+  FROM
+    rounded
+  )
+
+-- Aggregate costs across all queries
+SELECT
+  -- Time period information
+  CONCAT('Last ', interval_in_days, ' days (',
+    FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL interval_in_days DAY)),
+    ' to ',
+    FORMAT_TIMESTAMP('%Y-%m-%d', CURRENT_TIMESTAMP()),
+    ')') AS time_period,
+
+  -- Query count
+  COUNT(DISTINCT query) AS total_unique_queries,
+  COUNT(*) AS total_query_executions,
+
+  -- Total bytes processed
+  SUM(billedBytes) AS total_bytes_billed,
+  ROUND(SUM(billedBytes) / POW(1024, 4), 2) AS total_terabytes_billed,
+
+  -- Total slot hours (slots held multiplied by billed duration)
+  SUM(roundedUpSlots * billedDurationHour) AS total_billed_slot_hours,
+
+  -- Aggregated costs
+  ROUND(SUM(onDemandCost), 2) AS total_on_demand_cost,
+  ROUND(SUM(standardEditionCost), 2) AS total_standard_edition_cost,
+  ROUND(SUM(enterpriseEditionCost), 2) AS total_enterprise_edition_cost,
+  ROUND(SUM(enterpriseEdition1YearCost), 2) AS total_enterprise_1year_cost,
+  ROUND(SUM(enterpriseEdition3YearCost), 2) AS total_enterprise_3year_cost,
+  ROUND(SUM(enterprisePlusEditionCost), 2) AS total_enterprise_plus_cost,
+  ROUND(SUM(enterprisePlusEdition1YearCost), 2) AS total_enterprise_plus_1year_cost,
+  ROUND(SUM(enterprisePlusEdition3YearCost), 2) AS total_enterprise_plus_3year_cost,
+
+  -- Cost comparisons (positive means on-demand is more expensive)
+  ROUND(SUM(onDemandCost) - SUM(standardEditionCost), 2) AS on_demand_vs_standard_diff,
+  ROUND(SUM(onDemandCost) - SUM(enterpriseEditionCost), 2) AS on_demand_vs_enterprise_diff,
+  ROUND(SUM(onDemandCost) - SUM(enterpriseEdition1YearCost), 2) AS on_demand_vs_enterprise_1year_diff,
+  ROUND(SUM(onDemandCost) - SUM(enterpriseEdition3YearCost), 2) AS on_demand_vs_enterprise_3year_diff,
+
+  -- Cost savings percentages
+  ROUND(100 * (SUM(onDemandCost) - SUM(standardEditionCost)) / NULLIF(SUM(onDemandCost), 0), 2) AS standard_edition_savings_pct,
+  ROUND(100 * (SUM(onDemandCost) - SUM(enterpriseEditionCost)) / NULLIF(SUM(onDemandCost), 0), 2) AS enterprise_edition_savings_pct,
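+  -- (Savings percentages are relative to the on-demand baseline; negative values mean that edition would cost more)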
+  ROUND(100 * (SUM(onDemandCost) - SUM(enterpriseEdition1YearCost)) / NULLIF(SUM(onDemandCost), 0), 2) AS enterprise_1year_savings_pct,
+  ROUND(100 * (SUM(onDemandCost) - SUM(enterpriseEdition3YearCost)) / NULLIF(SUM(onDemandCost), 0), 2) AS enterprise_3year_savings_pct,
+
+  -- Overall recommendation
+  CASE
+    WHEN SUM(onDemandCost) < LEAST(SUM(standardEditionCost), SUM(enterpriseEditionCost),
+        SUM(enterpriseEdition1YearCost), SUM(enterpriseEdition3YearCost)) THEN 'On-demand pricing recommended'
+    WHEN SUM(enterpriseEdition3YearCost) < SUM(standardEditionCost) AND
+         SUM(enterpriseEdition3YearCost) < SUM(enterpriseEdition1YearCost) AND
+         SUM(enterpriseEdition3YearCost) < SUM(enterpriseEditionCost) THEN 'Enterprise Edition with 3-year commitment recommended'
+    WHEN SUM(enterpriseEdition1YearCost) < SUM(standardEditionCost) AND
+         SUM(enterpriseEdition1YearCost) < SUM(enterpriseEditionCost) THEN 'Enterprise Edition with 1-year commitment recommended'
+    WHEN SUM(enterpriseEditionCost) < SUM(standardEditionCost) THEN 'Enterprise Edition recommended'
+    ELSE 'Standard Edition recommended'
+  END AS overall_recommendation
+FROM
+  costs;
+END