Skip to content

Commit a1b7341

Browse files
mskrzypkows and Maciej Skrzypkowski authored
Non-Latin characters support (apache#840)
* Non latin characters

---------

Co-authored-by: Maciej Skrzypkowski <[email protected]>

* Test for mysql

---------

Co-authored-by: Maciej Skrzypkowski <[email protected]>
1 parent eb67d48 commit a1b7341

File tree

6 files changed

+34
-30
lines changed

6 files changed

+34
-30
lines changed

src/dialect/generic.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,11 @@ pub struct GenericDialect;
1717

1818
impl Dialect for GenericDialect {
1919
fn is_identifier_start(&self, ch: char) -> bool {
20-
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch == '_' || ch == '#' || ch == '@'
20+
ch.is_alphabetic() || ch == '_' || ch == '#' || ch == '@'
2121
}
2222

2323
fn is_identifier_part(&self, ch: char) -> bool {
24-
ch.is_ascii_lowercase()
25-
|| ch.is_ascii_uppercase()
24+
ch.is_alphabetic()
2625
|| ch.is_ascii_digit()
2726
|| ch == '@'
2827
|| ch == '$'

src/dialect/mssql.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,11 @@ impl Dialect for MsSqlDialect {
2222

2323
fn is_identifier_start(&self, ch: char) -> bool {
2424
// See https://docs.microsoft.com/en-us/sql/relational-databases/databases/database-identifiers?view=sql-server-2017#rules-for-regular-identifiers
25-
// We don't support non-latin "letters" currently.
26-
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch == '_' || ch == '#' || ch == '@'
25+
ch.is_alphabetic() || ch == '_' || ch == '#' || ch == '@'
2726
}
2827

2928
fn is_identifier_part(&self, ch: char) -> bool {
30-
ch.is_ascii_lowercase()
31-
|| ch.is_ascii_uppercase()
29+
ch.is_alphabetic()
3230
|| ch.is_ascii_digit()
3331
|| ch == '@'
3432
|| ch == '$'

src/dialect/mysql.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@ impl Dialect for MySqlDialect {
2020
// See https://dev.mysql.com/doc/refman/8.0/en/identifiers.html.
2121
// We don't yet support identifiers beginning with numbers, as that
2222
// makes it hard to distinguish numeric literals.
23-
ch.is_ascii_lowercase()
24-
|| ch.is_ascii_uppercase()
23+
ch.is_alphabetic()
2524
|| ch == '_'
2625
|| ch == '$'
2726
|| ch == '@'

src/dialect/postgresql.rs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,12 @@ impl Dialect for PostgreSqlDialect {
2323
fn is_identifier_start(&self, ch: char) -> bool {
2424
// See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
2525
// We don't yet support identifiers beginning with "letters with
26-
// diacritical marks and non-Latin letters"
27-
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch == '_'
26+
// diacritical marks"
27+
ch.is_alphabetic() || ch == '_'
2828
}
2929

3030
fn is_identifier_part(&self, ch: char) -> bool {
31-
ch.is_ascii_lowercase()
32-
|| ch.is_ascii_uppercase()
33-
|| ch.is_ascii_digit()
34-
|| ch == '$'
35-
|| ch == '_'
31+
ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_'
3632
}
3733

3834
fn parse_statement(&self, parser: &mut Parser) -> Option<Result<Statement, ParserError>> {

src/tokenizer.rs

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1519,20 +1519,16 @@ mod tests {
15191519

15201520
#[test]
15211521
fn tokenize_invalid_string() {
1522-
let sql = String::from("\nمصطفىh");
1522+
let sql = String::from("\n💝مصطفىh");
15231523

15241524
let dialect = GenericDialect {};
15251525
let mut tokenizer = Tokenizer::new(&dialect, &sql);
15261526
let tokens = tokenizer.tokenize().unwrap();
15271527
// println!("tokens: {:#?}", tokens);
15281528
let expected = vec![
15291529
Token::Whitespace(Whitespace::Newline),
1530-
Token::Char('م'),
1531-
Token::Char('ص'),
1532-
Token::Char('ط'),
1533-
Token::Char('ف'),
1534-
Token::Char('ى'),
1535-
Token::make_word("h", None),
1530+
Token::Char('💝'),
1531+
Token::make_word("مصطفىh", None),
15361532
];
15371533
compare(expected, tokens);
15381534
}
@@ -1582,7 +1578,7 @@ mod tests {
15821578

15831579
#[test]
15841580
fn tokenize_invalid_string_cols() {
1585-
let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
1581+
let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");
15861582

15871583
let dialect = GenericDialect {};
15881584
let mut tokenizer = Tokenizer::new(&dialect, &sql);
@@ -1599,12 +1595,8 @@ mod tests {
15991595
Token::Whitespace(Whitespace::Space),
16001596
Token::make_keyword("table"),
16011597
Token::Whitespace(Whitespace::Tab),
1602-
Token::Char('م'),
1603-
Token::Char('ص'),
1604-
Token::Char('ط'),
1605-
Token::Char('ف'),
1606-
Token::Char('ى'),
1607-
Token::make_word("h", None),
1598+
Token::Char('💝'),
1599+
Token::make_word("مصطفىh", None),
16081600
];
16091601
compare(expected, tokens);
16101602
}

tests/sqlparser_common.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6734,3 +6734,23 @@ fn make_where_clause(num: usize) -> String {
67346734
}
67356735
output
67366736
}
6737+
6738+
#[test]
6739+
fn parse_non_latin_identifiers() {
6740+
let supported_dialects = TestedDialects {
6741+
dialects: vec![
6742+
Box::new(GenericDialect {}),
6743+
Box::new(PostgreSqlDialect {}),
6744+
Box::new(MsSqlDialect {}),
6745+
Box::new(RedshiftSqlDialect {}),
6746+
Box::new(MySqlDialect {}),
6747+
],
6748+
};
6749+
6750+
supported_dialects.verified_stmt("SELECT a.説明 FROM test.public.inter01 AS a");
6751+
supported_dialects.verified_stmt("SELECT a.説明 FROM inter01 AS a, inter01_transactions AS b WHERE a.説明 = b.取引 GROUP BY a.説明");
6752+
supported_dialects.verified_stmt("SELECT 説明, hühnervögel, garçon, Москва, 東京 FROM inter01");
6753+
assert!(supported_dialects
6754+
.parse_sql_statements("SELECT 💝 FROM table1")
6755+
.is_err());
6756+
}

0 commit comments

Comments
 (0)