diff --git a/.gitignore b/.gitignore index 2bc52715..644d7186 100644 --- a/.gitignore +++ b/.gitignore @@ -147,3 +147,6 @@ benchmark_*.png # VSCode .vscode + +# History +.history \ No newline at end of file diff --git a/README.md b/README.md index 7f04dec8..fb422a47 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ More information about the algorithm and performance considerations can be found # Get started ## Validating dbt model changes between dev and prod -⚡ Looking to use `data-diff` in dbt development? Head over to [our `data-diff` + `dbt` documentation](https://docs.datafold.com/development_testing/open_source/) to get started! +⚡ Looking to use `data-diff` in dbt development? Head over to [our `data-diff` + `dbt` documentation](https://docs.datafold.com/development_testing/how_it_works) to get started! ## Compare data tables between databases 🔀 To compare data between databases, install `data-diff` with specific database adapters, e.g.: diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py index 871c650d..c5931979 100644 --- a/data_diff/databases/base.py +++ b/data_diff/databases/base.py @@ -800,6 +800,12 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str: Date format: ``YYYY-MM-DD HH:mm:SS.FFFFFF`` Precision of dates should be rounded up/down according to coltype.rounds + e.g. precision 3 and coltype.rounds: + - 1969-12-31 23:59:59.999999 -> 1970-01-01 00:00:00.000000 + - 1970-01-01 00:00:00.000888 -> 1970-01-01 00:00:00.001000 + - 1970-01-01 00:00:00.123123 -> 1970-01-01 00:00:00.123000 + + Make sure NULLs remain NULLs """ @abstractmethod diff --git a/data_diff/databases/postgresql.py b/data_diff/databases/postgresql.py index 4b9e945f..d29fa0eb 100644 --- a/data_diff/databases/postgresql.py +++ b/data_diff/databases/postgresql.py @@ -102,13 +102,40 @@ def md5_as_hex(self, s: str) -> str: return f"md5({s})" def normalize_timestamp(self, value: str, coltype: TemporalType) -> str: + def _add_padding(coltype: TemporalType, timestamp6: str): + return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')" + if coltype.rounds: - return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')" + # NULL value expected to return NULL after normalization + null_case_begin = f"CASE WHEN {value} IS NULL THEN NULL ELSE " + null_case_end = "END" + + # 294277 or 4714 BC would be out of range, make sure we can't round to that + # TODO test timezones for overflow? + max_timestamp = "294276-12-31 23:59:59.0000" + min_timestamp = "4713-01-01 00:00:00.00 BC" + timestamp = f"least('{max_timestamp}'::timestamp(6), {value}::timestamp(6))" + timestamp = f"greatest('{min_timestamp}'::timestamp(6), {timestamp})" + + interval = format((0.5 * (10 ** (-coltype.precision))), f".{coltype.precision+1}f") + + rounded_timestamp = ( + f"left(to_char(least('{max_timestamp}'::timestamp, {timestamp})" + f"+ interval '{interval}', 'YYYY-mm-dd HH24:MI:SS.US')," + f"length(to_char(least('{max_timestamp}'::timestamp, {timestamp})" + f"+ interval '{interval}', 'YYYY-mm-dd HH24:MI:SS.US')) - (6-{coltype.precision}))" + ) - timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')" - return ( - f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')" - ) + padded = _add_padding(coltype, rounded_timestamp) + return f"{null_case_begin} {padded} {null_case_end}" + + # TODO years with > 4 digits not padded correctly + # current w/ precision 6: 294276-12-31 23:59:59.0000 + # should be 294276-12-31 23:59:59.000000 + else: + rounded_timestamp = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')" + padded = _add_padding(coltype, rounded_timestamp) + return padded def normalize_number(self, value: str, coltype: FractionalType) -> str: return self.to_string(f"{value}::decimal(38, {coltype.precision})") diff --git a/data_diff/databases/redshift.py b/data_diff/databases/redshift.py index dcf061c4..d31258e1 100644 --- a/data_diff/databases/redshift.py +++ b/data_diff/databases/redshift.py @@ -51,26 +51,6 @@ def md5_as_int(self, s: str) -> str: def md5_as_hex(self, s: str) -> str: return f"md5({s})" - def normalize_timestamp(self, value: str, coltype: TemporalType) -> str: - if coltype.rounds: - timestamp = f"{value}::timestamp(6)" - # Get seconds since epoch. Redshift doesn't support milli- or micro-seconds. - secs = f"timestamp 'epoch' + round(extract(epoch from {timestamp})::decimal(38)" - # Get the milliseconds from timestamp. - ms = f"extract(ms from {timestamp})" - # Get the microseconds from timestamp, without the milliseconds! - us = f"extract(us from {timestamp})" - # epoch = Total time since epoch in microseconds. - epoch = f"{secs}*1000000 + {ms}*1000 + {us}" - timestamp6 = ( - f"to_char({epoch}, -6+{coltype.precision}) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')" - ) - else: - timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')" - return ( - f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')" - ) - def normalize_number(self, value: str, coltype: FractionalType) -> str: return self.to_string(f"{value}::decimal(38,{coltype.precision})") diff --git a/data_diff/dbt_config_validators.py b/data_diff/dbt_config_validators.py index 5258a0ca..e7c548a4 100644 --- a/data_diff/dbt_config_validators.py +++ b/data_diff/dbt_config_validators.py @@ -31,7 +31,7 @@ class DependsOn(BaseModel): resource_type: str name: str alias: str - database: str + database: Optional[str] schema_: str = Field(..., alias="schema") columns: Optional[Dict[str, Column]] meta: Dict[str, Any] diff --git a/data_diff/version.py b/data_diff/version.py index 0db237a6..e1170d35 100644 --- a/data_diff/version.py +++ b/data_diff/version.py @@ -1 +1 @@ -__version__ = "0.9.16" +__version__ = "0.9.17" diff --git a/pyproject.toml b/pyproject.toml index 9394dd8b..c3448c01 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "data-diff" -version = "0.9.16" +version = "0.9.17" description = "Command-line tool and Python library to efficiently diff rows across two different databases." authors = ["Datafold "] license = "MIT"