Skip to content

Commit e31d29f

Browse files
Manfred authored and NZKoz committed
Add methods for string verification and encoding cleanup code.
Signed-off-by: Michael Koziarski <[email protected]>
1 parent 7192691 commit e31d29f

File tree

4 files changed

+165
-11
lines changed

4 files changed

+165
-11
lines changed

activesupport/lib/active_support/multibyte.rb

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,25 @@ module Multibyte #:nodoc:
33
DEFAULT_NORMALIZATION_FORM = :kc
44
NORMALIZATIONS_FORMS = [:c, :kc, :d, :kd]
55
UNICODE_VERSION = '5.0.0'
6+
7+
# Regular expressions that describe valid byte sequences for a character
8+
# Maps an encoding name to a regular expression that matches exactly ONE
# valid character (byte sequence) in that encoding; each pattern is anchored
# with \A and \z.
#
# The patterns use the /x flag for readability. Whitespace BETWEEN atoms is
# ignored, but whitespace INSIDE a bracket expression is still a literal
# space, so the character classes below are deliberately written without
# inner spaces (otherwise 0x20 would be accepted as a Shift_JIS trail byte).
VALID_CHARACTER = {
  # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
  'UTF-8' => /\A(?:
    [\x00-\x7f]                                     |
    [\xc2-\xdf] [\x80-\xbf]                         |
    \xe0        [\xa0-\xbf] [\x80-\xbf]             |
    [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]             |
    \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
    [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
    \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn,
  # Quick check for valid Shift-JIS characters, disregards the odd-even pairing
  'Shift_JIS' => /\A(?:
    [\x00-\x7e\xa1-\xdf]                            |
    [\x81-\x9f\xe0-\xef] [\x40-\x7e\x80-\x9e\x9f-\xfc])\z /xn
}
623
end
724
end
825

926
require 'active_support/multibyte/chars'
27+
require 'active_support/multibyte/utils'

activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -100,16 +100,7 @@ class UTF8Handler
100100
# between little and big endian. This is not an issue in utf-8, so it must be ignored.
101101
UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM
102102

103-
# Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
104-
UTF8_PAT = /\A(?:
105-
[\x00-\x7f] |
106-
[\xc2-\xdf] [\x80-\xbf] |
107-
\xe0 [\xa0-\xbf] [\x80-\xbf] |
108-
[\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
109-
\xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
110-
[\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
111-
\xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
112-
)*\z/xn
103+
# Alias for the shared UTF-8 pattern in ActiveSupport::Multibyte::VALID_CHARACTER.
# NOTE(review): the literal this replaces was closed with `)*\z`, so it accepted a
# whole string of valid characters (and the empty string). The shared pattern is
# closed with `)\z` and accepts exactly one character. Confirm that no remaining
# user of UTF8_PAT passes it a multi-character string.
UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']
113104

114105
# Returns a regular expression pattern that matches the passed Unicode codepoints
115106
def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
@@ -357,7 +348,7 @@ def g_length(str)
357348
# Replaces all the non-utf-8 bytes by their iso-8859-1 or cp1252 equivalent resulting in a valid utf-8 string
358349
def tidy_bytes(str)
359350
str.split(//u).map do |c|
360-
if !UTF8_PAT.match(c)
351+
if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
361352
n = c.unpack('C')[0]
362353
n < 128 ? n.chr :
363354
n < 160 ? [UCD.cp1252[n] || n].pack('U') :
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
module ActiveSupport #:nodoc:
  module Multibyte #:nodoc:
    # Returns the regular expression that matches one valid character in the
    # encoding selected by $KCODE: the 'UTF-8' entry of VALID_CHARACTER for
    # 'UTF8', the 'Shift_JIS' entry for 'SJIS', and nil for any other value.
    def self.valid_character
      case $KCODE
      when 'UTF8'
        VALID_CHARACTER['UTF-8']
      when 'SJIS'
        VALID_CHARACTER['Shift_JIS']
      end
    end

    # Verifies the encoding of a string.
    #
    # Returns true when every character of +string+ matches the expression
    # returned by valid_character, and false as soon as one does not. When
    # there is no expression for the current $KCODE the string is accepted
    # unchecked.
    def self.verify(string)
      # Look the expression up once instead of once per character.
      expression = valid_character
      return true unless expression

      string.split(//).all? { |c| expression.match(c) }
    end

    # Verifies the encoding of the string and raises
    # ActiveSupport::Multibyte::Handlers::EncodingError when it's not valid.
    # Returns nil otherwise.
    def self.verify!(string)
      raise ActiveSupport::Multibyte::Handlers::EncodingError, "Found characters with invalid encoding" unless verify(string)
    end

    # Removes all invalid characters from the string.
    #
    # Returns a new string made of the characters that match the expression
    # returned by valid_character. When there is no expression for the
    # current $KCODE, +string+ itself is returned untouched.
    def self.clean(string)
      # Look the expression up once instead of once per character.
      expression = valid_character
      return string unless expression

      string.split(//).select { |c| expression.match(c) }.join
    end
  end
end
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
require 'abstract_unit'
2+
3+
# Tests for ActiveSupport::Multibyte.valid_character, verify, verify! and clean
# under the different $KCODE settings.
class MultibyteUtilsTest < Test::Unit::TestCase

  def test_valid_character_returns_an_expression_for_the_current_encoding
    with_kcode('None') do
      assert_nil ActiveSupport::Multibyte.valid_character
    end
    with_kcode('UTF8') do
      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character
    end
    with_kcode('SJIS') do
      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character
    end
  end

  def test_verify_verifies_ASCII_strings_are_properly_encoded
    with_kcode('None') do
      examples.each do |example|
        assert ActiveSupport::Multibyte.verify(example)
      end
    end
  end

  def test_verify_verifies_UTF_8_strings_are_properly_encoded
    with_kcode('UTF8') do
      assert ActiveSupport::Multibyte.verify(example('valid UTF-8'))
      assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8'))
    end
  end

  def test_verify_verifies_Shift_JIS_strings_are_properly_encoded
    with_kcode('SJIS') do
      assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS'))
      assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS'))
    end
  end

  def test_verify_bang_raises_an_exception_when_it_finds_an_invalid_character
    with_kcode('UTF8') do
      assert_raises(ActiveSupport::Multibyte::Handlers::EncodingError) do
        ActiveSupport::Multibyte.verify!(example('invalid UTF-8'))
      end
    end
  end

  def test_verify_bang_doesnt_raise_an_exception_when_the_encoding_is_valid
    with_kcode('UTF8') do
      assert_nothing_raised do
        ActiveSupport::Multibyte.verify!(example('valid UTF-8'))
      end
    end
  end

  def test_clean_leaves_ASCII_strings_intact
    with_kcode('None') do
      [
        'word', "\270\236\010\210\245"
      ].each do |string|
        assert_equal string, ActiveSupport::Multibyte.clean(string)
      end
    end
  end

  def test_clean_cleans_invalid_characters_from_UTF_8_encoded_strings
    with_kcode('UTF8') do
      cleaned_utf8 = [8].pack('C*')
      assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8'))
      assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8'))
    end
  end

  def test_clean_cleans_invalid_characters_from_Shift_JIS_encoded_strings
    with_kcode('SJIS') do
      cleaned_sjis = [184, 0, 136, 165].pack('C*')
      assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS'))
      assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
    end
  end

  private

  # Raw byte fixtures, keyed by a description of their encoding and validity.
  STRINGS = {
    'valid ASCII'       => [65, 83, 67, 73, 73].pack('C*'),
    'invalid ASCII'     => [128].pack('C*'),
    'valid UTF-8'       => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'),
    'invalid UTF-8'     => [184, 158, 8, 136, 165].pack('C*'),
    'valid Shift-JIS'   => [131, 122, 129, 91, 131, 128].pack('C*'),
    'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*')
  }

  # Returns the fixture string stored under +key+ in STRINGS.
  def example(key)
    STRINGS[key]
  end

  # Returns all fixture strings.
  def examples
    STRINGS.values
  end

  # Runs the given block with $KCODE set to +code+ and puts the previous
  # value back afterwards. The restore sits in an ensure clause so that a
  # failing assertion (which raises) cannot leak the temporary $KCODE into
  # the tests that run after it.
  def with_kcode(code)
    before = $KCODE
    $KCODE = code
    yield
  ensure
    $KCODE = before
  end
end

0 commit comments

Comments
 (0)