aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Louise Crow <louise.crow@gmail.com> 2015-06-09 16:33:03 +0100
committer Louise Crow <louise.crow@gmail.com> 2015-06-22 17:43:19 +0100
commit 4440d11fb662c57428a2aba622209d6d1ddc0a59 (patch)
tree 320dc9c9d02c654bfc2e6991ffc761be0e8c7157
parent 910acfa8ae939f363a872123eb47a86e64a192c3 (diff)
Round trip through utf-16 to clean utf-8 string
As noted in the Ruby docs (http://ruby-doc.org/core-1.9.3/String.html#method-i-encode), any conversion from an encoding to the same encoding is a no-op, so convert the string to utf-16 first and then back to utf-8.
-rw-r--r-- lib/normalize_string.rb 16
-rw-r--r-- spec/lib/basic_encoding_spec.rb 5
2 files changed, 12 insertions, 9 deletions
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
index d850d7e05..8b54c080c 100644
--- a/lib/normalize_string.rb
+++ b/lib/normalize_string.rb
@@ -77,16 +77,20 @@ def convert_string_to_utf8(s, suggested_character_encoding=nil)
begin
result = normalize_string_to_utf8 s, suggested_character_encoding
rescue EncodingNormalizationError
- result = s
- if String.method_defined?(:encode)
- result = s.force_encoding("utf-8").encode("utf-8", :invalid => :replace,
- :undef => :replace,
- :replace => "")
- end
+ result = scrub(s)
end
result
end
+def scrub(string)
+ if String.method_defined?(:encode)
+ string = string.force_encoding("utf-8")
+ string.valid_encoding? ? string : string.encode("utf-16le", :invalid => :replace, :replace => "").encode("utf-8")
+ else
+ Iconv.conv('UTF-8//IGNORE', 'UTF-8', string)
+ end
+end
+
def log_text_details(message, text)
if String.method_defined?(:encode)
STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}"
diff --git a/spec/lib/basic_encoding_spec.rb b/spec/lib/basic_encoding_spec.rb
index d77465ad8..d802da892 100644
--- a/spec/lib/basic_encoding_spec.rb
+++ b/spec/lib/basic_encoding_spec.rb
@@ -160,17 +160,16 @@ describe "convert_string_to_utf8" do
describe "when passed uninterpretable character data" do
- it "should return it as a utf8 string" do
+ it "should return it as a valid utf8 string with non-utf8 characters removed" do
converted = convert_string_to_utf8 random_string
- converted.should == random_string
if String.method_defined?(:encode)
converted.encoding.to_s.should == 'UTF-8'
+ converted.valid_encoding?.should == true
end
converted = convert_string_to_utf8 random_string,'UTF-8'
- converted.should == random_string
if String.method_defined?(:encode)
converted.encoding.to_s.should == 'UTF-8'