BUG#35278365: Fix UnicodeDecodeError with a long field name alias (c-ext)

nmariz · nmariz · commit 34a00b1f038d · 2023-06-02T17:04:56.000+01:00
A UnicodeDecodeError is raised when a complex query produces a long
field name alias.
The C extension fails to create a Unicode object with `PyUnicode_Decode()`
from the resulting `MYSQL_FIELD.name` returned by the `mysql_fetch_fields()`
MySQL C API function.

This patch passes "replace" as the `errors` argument of `PyUnicode_Decode()`,
so bytes that cannot be decoded are substituted with a replacement marker
instead of raising an error.

Change-Id: Ifbeb8572e3c906fccbdb0b291e903775d132b494
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -19,6 +19,7 @@ v8.1.0
 - BUG#35349093: Compression doesn't work with C extension API
 - BUG#35338384: PIP installs incompatible Connector/Python packages
 - BUG#35318413: Fix charset mapping for MySQL 8.1.0
+- BUG#35278365: Fix UnicodeDecodeError with a long field name alias (c-ext)
 - BUG#35212199: Check for identifier quotes in the database name
 - BUG#35140271: Regex split hanging in cursor.execute(..., multi=True) for complex queries
 - BUG#29115406: CONTRIBUTION - FIX RECV COMPRESS BUG
diff --git a/src/mysql_capi_conversion.c b/src/mysql_capi_conversion.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, 2022, Oracle and/or its affiliates.
+ * Copyright (c) 2014, 2023, Oracle and/or its affiliates.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License, version 2.0, as
@@ -676,7 +676,7 @@ mytopy_string(const char *data, enum_field_types field_type,
 
     /* 'binary' charset = 63 */
     if (use_unicode && (field_type == MYSQL_TYPE_JSON || field_charsetnr != 63)) {
-        return PyUnicode_Decode(data, field_length, charset, NULL);
+        return PyUnicode_Decode(data, field_length, charset, "replace");
     }
 
     return PyByteArray_FromStringAndSize(data, field_length);
diff --git a/tests/test_bugs.py b/tests/test_bugs.py
@@ -7587,3 +7587,83 @@ def test_set_compress_cext_false(self):
         """Test setting `compress=False` in the C extension directly."""
         res = self._test_compression_status_cext(False)
         self.assertEqual(res, ("Compression", "OFF"))
+
+
+class BugOra35278365(tests.MySQLConnectorTests):
+    """BUG#35278365: Fix UnicodeDecodeError with a long field name alias (c-ext)
+
+    An UnicodeDecodeError is raised when using a complex query that produces
+    a long field name alias.
+    It fails to create an Unicode object using `PyUnicode_Decode()` from the
+    resulting `MYSQL_FIELD.name` returned by `mysql_fetch_fields()` MySQL
+    C API.
+
+    This patch uses "replace" in `PyUnicode_Decode()` to set how decoding
+    errors are handled, which uses a replace marker.
+    """
+
+    tbl_prefix = "BugOra35278365"
+
+    def setUp(self):
+        config = tests.get_mysql_config()
+        with mysql.connector.connect(**config) as cnx:
+            with cnx.cursor() as cur:
+                cnx.cmd_query(f"DROP TABLE IF EXISTS {self.tbl_prefix}_table1")
+                cnx.cmd_query(f"DROP TABLE IF EXISTS {self.tbl_prefix}_table2")
+                cnx.cmd_query(f"DROP TABLE IF EXISTS {self.tbl_prefix}_table3")
+                cur.execute(
+                    f"""
+                    CREATE TABLE {self.tbl_prefix}_table1 (
+                    id int(11) NOT NULL,
+                    ort_id int(11) DEFAULT NULL
+                    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci
+                    """
+                )
+                cur.execute(
+                    f"""
+                    CREATE TABLE {self.tbl_prefix}_table2 (
+                    id int(11) NOT NULL
+                    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci
+                    """
+                )
+                cur.execute(
+                    f"""
+                    CREATE TABLE {self.tbl_prefix}_table3 (
+                    besuch_id int(11) NOT NULL,
+                    taxon varchar(30) NOT NULL,
+                    epitheton varchar(30) DEFAULT NULL,
+                    rang int(1) NOT NULL
+                    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci
+                    """
+                )
+                cnx.commit()
+
+    def tearDown(self):
+        config = tests.get_mysql_config()
+        with mysql.connector.connect(**config) as cnx:
+            cnx.cmd_query(f"DROP TABLE IF EXISTS {self.tbl_prefix}_table1")
+            cnx.cmd_query(f"DROP TABLE IF EXISTS {self.tbl_prefix}_table2")
+            cnx.cmd_query(f"DROP TABLE IF EXISTS {self.tbl_prefix}_table3")
+            cnx.commit()
+
+    @foreach_cnx()
+    def test_long_field_names(self):
+        with mysql.connector.connect(**tests.get_mysql_config()) as cnx:
+            with cnx.cursor() as cur:
+                query = f"""
+                    SELECT (SELECT Group_concat(DISTINCT Concat_ws(' ', taxon,
+                                                Ifnull(t3.epitheton, 'sp.'))
+                        ORDER BY
+                            t3.taxon, Isnull(t3.epitheton), t3.epitheton SEPARATOR ', ')
+                            FROM   {self.tbl_prefix}_table3 t3
+                            WHERE  t3.besuch_id = {self.tbl_prefix}_table1.id
+                            AND t3.rang IN (1, 2)
+                            AND NOT EXISTS(
+                                SELECT 0 FROM {self.tbl_prefix}_table3 b3)), 1, 1
+                    FROM   {self.tbl_prefix}_table1,{self.tbl_prefix}_table2
+                    WHERE  {self.tbl_prefix}_table2.id = {self.tbl_prefix}_table1.ort_id
+                    ORDER  BY {self.tbl_prefix}_table1.id
+                """
+                cur.execute(query)  # No error is success
+                res = cur.fetchall()
+                self.assertEqual(res, [])

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * Copyright (c) 2014, 2022, Oracle and/or its affiliates.`
	`2`	`+ * Copyright (c) 2014, 2023, Oracle and/or its affiliates.`
`3`	`3`	`*`
`4`	`4`	`* This program is free software; you can redistribute it and/or modify`
`5`	`5`	`* it under the terms of the GNU General Public License, version 2.0, as`
`@@ -676,7 +676,7 @@ mytopy_string(const char *data, enum_field_types field_type,`
`676`	`676`
`677`	`677`	`/* 'binary' charset = 63 */`
`678`	`678`	`if (use_unicode && (field_type == MYSQL_TYPE_JSON \|\| field_charsetnr != 63)) {`
`679`		`- return PyUnicode_Decode(data, field_length, charset, NULL);`
	`679`	`+ return PyUnicode_Decode(data, field_length, charset, "replace");`
`680`	`680`	`}`
`681`	`681`
`682`	`682`	`return PyByteArray_FromStringAndSize(data, field_length);`