[1329] | 1 | diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb |
---|
| 2 | index 65a96af..b6354ee 100644 |
---|
| 3 | --- a/activesupport/lib/active_support/multibyte.rb |
---|
| 4 | +++ b/activesupport/lib/active_support/multibyte.rb |
---|
| 5 | @@ -1,9 +1,5 @@ |
---|
| 6 | # encoding: utf-8 |
---|
| 7 | |
---|
| 8 | -require 'active_support/multibyte/chars' |
---|
| 9 | -require 'active_support/multibyte/exceptions' |
---|
| 10 | -require 'active_support/multibyte/unicode_database' |
---|
| 11 | - |
---|
| 12 | module ActiveSupport #:nodoc: |
---|
| 13 | module Multibyte |
---|
| 14 | # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more |
---|
| 15 | @@ -27,7 +23,35 @@ module ActiveSupport #:nodoc: |
---|
| 16 | # |
---|
| 17 | # Example: |
---|
| 18 | # ActiveSupport::Multibyte.proxy_class = CharsForUTF32 |
---|
| 19 | - mattr_accessor :proxy_class |
---|
| 20 | - self.proxy_class = ActiveSupport::Multibyte::Chars |
---|
| 21 | + def self.proxy_class=(klass) |
---|
| 22 | + @proxy_class = klass |
---|
| 23 | + end |
---|
| 24 | + |
---|
| 25 | + # Returns the current proxy class |
---|
| 26 | + def self.proxy_class |
---|
| 27 | + @proxy_class ||= ActiveSupport::Multibyte::Chars |
---|
| 28 | + end |
---|
| 29 | + |
---|
| 30 | + # Regular expressions that describe valid byte sequences for a character |
---|
| 31 | + VALID_CHARACTER = { |
---|
| 32 | + # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) |
---|
| 33 | + 'UTF-8' => /\A(?: |
---|
| 34 | + [\x00-\x7f] | |
---|
| 35 | + [\xc2-\xdf] [\x80-\xbf] | |
---|
| 36 | + \xe0 [\xa0-\xbf] [\x80-\xbf] | |
---|
| 37 | + [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] | |
---|
| 38 | + \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | |
---|
| 39 | + [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | |
---|
| 40 | + \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn, |
---|
| 41 | + # Quick check for valid Shift-JIS characters; disregards the odd-even pairing |
---|
| 42 | + 'Shift_JIS' => /\A(?: |
---|
| 43 | + [\x00-\x7e \xa1-\xdf] | |
---|
| 44 | + [\x81-\x9f \xe0-\xef] [\x40-\x7e \x80-\x9e \x9f-\xfc])\z /xn |
---|
| 45 | + } |
---|
| 46 | end |
---|
| 47 | end |
---|
| 48 | + |
---|
| 49 | +require 'active_support/multibyte/chars' |
---|
| 50 | +require 'active_support/multibyte/exceptions' |
---|
| 51 | +require 'active_support/multibyte/unicode_database' |
---|
| 52 | +require 'active_support/multibyte/utils' |
---|
| 53 | diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb |
---|
| 54 | index 3d392d2..16bc130 100644 |
---|
| 55 | --- a/activesupport/lib/active_support/multibyte/chars.rb |
---|
| 56 | +++ b/activesupport/lib/active_support/multibyte/chars.rb |
---|
| 57 | @@ -73,16 +73,7 @@ module ActiveSupport #:nodoc: |
---|
| 58 | UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/ |
---|
| 59 | UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/ |
---|
| 60 | |
---|
| 61 | - # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) |
---|
| 62 | - UTF8_PAT = /\A(?: |
---|
| 63 | - [\x00-\x7f] | |
---|
| 64 | - [\xc2-\xdf] [\x80-\xbf] | |
---|
| 65 | - \xe0 [\xa0-\xbf] [\x80-\xbf] | |
---|
| 66 | - [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] | |
---|
| 67 | - \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | |
---|
| 68 | - [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | |
---|
| 69 | - \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] |
---|
| 70 | - )*\z/xn |
---|
| 71 | + UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'] |
---|
| 72 | |
---|
| 73 | attr_reader :wrapped_string |
---|
| 74 | alias to_s wrapped_string |
---|
| 75 | @@ -307,23 +298,23 @@ module ActiveSupport #:nodoc: |
---|
| 76 | def rstrip |
---|
| 77 | chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, '')) |
---|
| 78 | end |
---|
| 79 | - |
---|
| 80 | + |
---|
| 81 | # Strips entire range of Unicode whitespace from the left of the string. |
---|
| 82 | def lstrip |
---|
| 83 | chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, '')) |
---|
| 84 | end |
---|
| 85 | - |
---|
| 86 | + |
---|
| 87 | # Strips entire range of Unicode whitespace from the right and left of the string. |
---|
| 88 | def strip |
---|
| 89 | rstrip.lstrip |
---|
| 90 | end |
---|
| 91 | - |
---|
| 92 | + |
---|
| 93 | # Returns the number of codepoints in the string |
---|
| 94 | def size |
---|
| 95 | self.class.u_unpack(@wrapped_string).size |
---|
| 96 | end |
---|
| 97 | alias_method :length, :size |
---|
| 98 | - |
---|
| 99 | + |
---|
| 100 | # Reverses all characters in the string. |
---|
| 101 | # |
---|
| 102 | # Example: |
---|
| 103 | @@ -331,7 +322,7 @@ module ActiveSupport #:nodoc: |
---|
| 104 | def reverse |
---|
| 105 | chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*')) |
---|
| 106 | end |
---|
| 107 | - |
---|
| 108 | + |
---|
| 109 | # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that |
---|
| 110 | # character. |
---|
| 111 | # |
---|
| 112 | @@ -646,7 +637,7 @@ module ActiveSupport #:nodoc: |
---|
| 113 | string.split(//u).map do |c| |
---|
| 114 | c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding) |
---|
| 115 | |
---|
| 116 | - if !UTF8_PAT.match(c) |
---|
| 117 | + if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c) |
---|
| 118 | n = c.unpack('C')[0] |
---|
| 119 | n < 128 ? n.chr : |
---|
| 120 | n < 160 ? [UCD.cp1252[n] || n].pack('U') : |
---|
| 121 | diff --git a/activesupport/lib/active_support/multibyte/utils.rb b/activesupport/lib/active_support/multibyte/utils.rb |
---|
| 122 | new file mode 100644 |
---|
| 123 | index 0000000..acef84d |
---|
| 124 | --- /dev/null |
---|
| 125 | +++ b/activesupport/lib/active_support/multibyte/utils.rb |
---|
| 126 | @@ -0,0 +1,61 @@ |
---|
| 127 | +# encoding: utf-8 |
---|
| 128 | + |
---|
| 129 | +module ActiveSupport #:nodoc: |
---|
| 130 | + module Multibyte #:nodoc: |
---|
| 131 | + if Kernel.const_defined?(:Encoding) |
---|
| 132 | + # Returns a regular expression that matches valid characters in the current encoding |
---|
| 133 | + def self.valid_character |
---|
| 134 | + VALID_CHARACTER[Encoding.default_internal.to_s] |
---|
| 135 | + end |
---|
| 136 | + else |
---|
| 137 | + def self.valid_character |
---|
| 138 | + case $KCODE |
---|
| 139 | + when 'UTF8' |
---|
| 140 | + VALID_CHARACTER['UTF-8'] |
---|
| 141 | + when 'SJIS' |
---|
| 142 | + VALID_CHARACTER['Shift_JIS'] |
---|
| 143 | + end |
---|
| 144 | + end |
---|
| 145 | + end |
---|
| 146 | + |
---|
| 147 | + if 'string'.respond_to?(:valid_encoding?) |
---|
| 148 | + # Verifies the encoding of a string |
---|
| 149 | + def self.verify(string) |
---|
| 150 | + string.valid_encoding? |
---|
| 151 | + end |
---|
| 152 | + else |
---|
| 153 | + def self.verify(string) |
---|
| 154 | + if expression = valid_character |
---|
| 155 | + for c in string.split(//) |
---|
| 156 | + return false unless valid_character.match(c) |
---|
| 157 | + end |
---|
| 158 | + end |
---|
| 159 | + true |
---|
| 160 | + end |
---|
| 161 | + end |
---|
| 162 | + |
---|
| 163 | + # Verifies the encoding of the string and raises an exception when it's not valid |
---|
| 164 | + def self.verify!(string) |
---|
| 165 | + raise EncodingError.new("Found characters with invalid encoding") unless verify(string) |
---|
| 166 | + end |
---|
| 167 | + |
---|
| 168 | + if 'string'.respond_to?(:force_encoding) |
---|
| 169 | + # Removes all invalid characters from the string. |
---|
| 170 | + # |
---|
| 171 | + # Note: this method is a no-op in Ruby 1.9 |
---|
| 172 | + def self.clean(string) |
---|
| 173 | + string |
---|
| 174 | + end |
---|
| 175 | + else |
---|
| 176 | + def self.clean(string) |
---|
| 177 | + if expression = valid_character |
---|
| 178 | + stripped = []; for c in string.split(//) |
---|
| 179 | + stripped << c if valid_character.match(c) |
---|
| 180 | + end; stripped.join |
---|
| 181 | + else |
---|
| 182 | + string |
---|
| 183 | + end |
---|
| 184 | + end |
---|
| 185 | + end |
---|
| 186 | + end |
---|
| 187 | +end |
---|
| 188 | \ No newline at end of file |
---|
| 189 | diff --git a/activesupport/test/multibyte_utils_test.rb b/activesupport/test/multibyte_utils_test.rb |
---|
| 190 | new file mode 100644 |
---|
| 191 | index 0000000..d8ac5ff |
---|
| 192 | --- /dev/null |
---|
| 193 | +++ b/activesupport/test/multibyte_utils_test.rb |
---|
| 194 | @@ -0,0 +1,141 @@ |
---|
| 195 | +# encoding: utf-8 |
---|
| 196 | + |
---|
| 197 | +require 'abstract_unit' |
---|
| 198 | +require 'multibyte_test_helpers' |
---|
| 199 | + |
---|
| 200 | +class MultibyteUtilsTest < ActiveSupport::TestCase |
---|
| 201 | + include MultibyteTestHelpers |
---|
| 202 | + |
---|
| 203 | + test "valid_character returns an expression for the current encoding" do |
---|
| 204 | + with_encoding('None') do |
---|
| 205 | + assert_nil ActiveSupport::Multibyte.valid_character |
---|
| 206 | + end |
---|
| 207 | + with_encoding('UTF8') do |
---|
| 208 | + assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character |
---|
| 209 | + end |
---|
| 210 | + with_encoding('SJIS') do |
---|
| 211 | + assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character |
---|
| 212 | + end |
---|
| 213 | + end |
---|
| 214 | + |
---|
| 215 | + test "verify verifies ASCII strings are properly encoded" do |
---|
| 216 | + with_encoding('None') do |
---|
| 217 | + examples.each do |example| |
---|
| 218 | + assert ActiveSupport::Multibyte.verify(example) |
---|
| 219 | + end |
---|
| 220 | + end |
---|
| 221 | + end |
---|
| 222 | + |
---|
| 223 | + test "verify verifies UTF-8 strings are properly encoded" do |
---|
| 224 | + with_encoding('UTF8') do |
---|
| 225 | + assert ActiveSupport::Multibyte.verify(example('valid UTF-8')) |
---|
| 226 | + assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8')) |
---|
| 227 | + end |
---|
| 228 | + end |
---|
| 229 | + |
---|
| 230 | + test "verify verifies Shift-JIS strings are properly encoded" do |
---|
| 231 | + with_encoding('SJIS') do |
---|
| 232 | + assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS')) |
---|
| 233 | + assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS')) |
---|
| 234 | + end |
---|
| 235 | + end |
---|
| 236 | + |
---|
| 237 | + test "verify! raises an exception when it finds an invalid character" do |
---|
| 238 | + with_encoding('UTF8') do |
---|
| 239 | + assert_raises(ActiveSupport::Multibyte::EncodingError) do |
---|
| 240 | + ActiveSupport::Multibyte.verify!(example('invalid UTF-8')) |
---|
| 241 | + end |
---|
| 242 | + end |
---|
| 243 | + end |
---|
| 244 | + |
---|
| 245 | + test "verify! doesn't raise an exception when the encoding is valid" do |
---|
| 246 | + with_encoding('UTF8') do |
---|
| 247 | + assert_nothing_raised do |
---|
| 248 | + ActiveSupport::Multibyte.verify!(example('valid UTF-8')) |
---|
| 249 | + end |
---|
| 250 | + end |
---|
| 251 | + end |
---|
| 252 | + |
---|
| 253 | + if RUBY_VERSION < '1.9' |
---|
| 254 | + test "clean leaves ASCII strings intact" do |
---|
| 255 | + with_encoding('None') do |
---|
| 256 | + [ |
---|
| 257 | + 'word', "\270\236\010\210\245" |
---|
| 258 | + ].each do |string| |
---|
| 259 | + assert_equal string, ActiveSupport::Multibyte.clean(string) |
---|
| 260 | + end |
---|
| 261 | + end |
---|
| 262 | + end |
---|
| 263 | + |
---|
| 264 | + test "clean cleans invalid characters from UTF-8 encoded strings" do |
---|
| 265 | + with_encoding('UTF8') do |
---|
| 266 | + cleaned_utf8 = [8].pack('C*') |
---|
| 267 | + assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8')) |
---|
| 268 | + assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8')) |
---|
| 269 | + end |
---|
| 270 | + end |
---|
| 271 | + |
---|
| 272 | + test "clean cleans invalid characters from Shift-JIS encoded strings" do |
---|
| 273 | + with_encoding('SJIS') do |
---|
| 274 | + cleaned_sjis = [184, 0, 136, 165].pack('C*') |
---|
| 275 | + assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS')) |
---|
| 276 | + assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS')) |
---|
| 277 | + end |
---|
| 278 | + end |
---|
| 279 | + else |
---|
| 280 | + test "clean is a no-op" do |
---|
| 281 | + with_encoding('UTF8') do |
---|
| 282 | + assert_equal example('invalid Shift-JIS'), ActiveSupport::Multibyte.clean(example('invalid Shift-JIS')) |
---|
| 283 | + end |
---|
| 284 | + end |
---|
| 285 | + end |
---|
| 286 | + |
---|
| 287 | + private |
---|
| 288 | + |
---|
| 289 | + STRINGS = { |
---|
| 290 | + 'valid ASCII' => [65, 83, 67, 73, 73].pack('C*'), |
---|
| 291 | + 'invalid ASCII' => [128].pack('C*'), |
---|
| 292 | + 'valid UTF-8' => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'), |
---|
| 293 | + 'invalid UTF-8' => [184, 158, 8, 136, 165].pack('C*'), |
---|
| 294 | + 'valid Shift-JIS' => [131, 122, 129, 91, 131, 128].pack('C*'), |
---|
| 295 | + 'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*') |
---|
| 296 | + } |
---|
| 297 | + |
---|
| 298 | + if Kernel.const_defined?(:Encoding) |
---|
| 299 | + def example(key) |
---|
| 300 | + STRINGS[key].force_encoding(Encoding.default_internal) |
---|
| 301 | + end |
---|
| 302 | + |
---|
| 303 | + def examples |
---|
| 304 | + STRINGS.values.map { |s| s.force_encoding(Encoding.default_internal) } |
---|
| 305 | + end |
---|
| 306 | + else |
---|
| 307 | + def example(key) |
---|
| 308 | + STRINGS[key] |
---|
| 309 | + end |
---|
| 310 | + |
---|
| 311 | + def examples |
---|
| 312 | + STRINGS.values |
---|
| 313 | + end |
---|
| 314 | + end |
---|
| 315 | + |
---|
| 316 | + if 'string'.respond_to?(:encoding) |
---|
| 317 | + def with_encoding(enc) |
---|
| 318 | + before = Encoding.default_internal |
---|
| 319 | + |
---|
| 320 | + case enc |
---|
| 321 | + when 'UTF8' |
---|
| 322 | + Encoding.default_internal = Encoding::UTF_8 |
---|
| 323 | + when 'SJIS' |
---|
| 324 | + Encoding.default_internal = Encoding::Shift_JIS |
---|
| 325 | + else |
---|
| 326 | + Encoding.default_internal = Encoding::BINARY |
---|
| 327 | + end |
---|
| 328 | + yield |
---|
| 329 | + |
---|
| 330 | + Encoding.default_internal = before |
---|
| 331 | + end |
---|
| 332 | + else |
---|
| 333 | + alias with_encoding with_kcode |
---|
| 334 | + end |
---|
| 335 | +end |
---|
| 336 | \ No newline at end of file |
---|
| 337 | |
---|