diff options
| author | Louise Crow <louise.crow@gmail.com> | 2015-05-13 08:43:32 +0100 | 
|---|---|---|
| committer | Louise Crow <louise.crow@gmail.com> | 2015-06-23 10:09:27 +0100 | 
| commit | 048a172c2f3157fa2f2dd0b1f66026ca630b018f (patch) | |
| tree | 469b1df904999a0ec4c7c9555a38cf438931036c | |
| parent | ac59a76cc380af792a09e5dd82b8402dc966af42 (diff) | |
Add script to detect poorly encoded UTF-8 in existing data.
| -rw-r--r-- | lib/tasks/temp.rake | 96 | 
1 files changed, 96 insertions, 0 deletions
| diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake index 67fa10174..d5f7e8b22 100644 --- a/lib/tasks/temp.rake +++ b/lib/tasks/temp.rake @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*-  namespace :temp do @@ -37,4 +38,99 @@ namespace :temp do      end +    desc 'Look for and fix invalid UTF-8 text in various models. Should be run under ruby 1.9 or above' +    task :fix_invalid_utf8 => :environment do + +        dryrun = ENV['DRYRUN'] != '0' +        if dryrun +            $stderr.puts "This is a dryrun - nothing will be changed" +        end + + +        PublicBody.find_each do |public_body| +            unless public_body.name.valid_encoding? +                name = convert_string_to_utf8(public_body.name) +                puts "Bad encoding in PublicBody name, id: #{public_body.id}, " \ +                "old name: #{public_body.name.force_encoding('UTF-8')}, new name #{name}" +                unless dryrun +                    public_body.name_will_change! +                    public_body.name = name +                    public_body.last_edit_editor = 'system' +                    public_body.last_edit_comment = 'Invalid utf-8 encoding fixed by temp:fix_invalid_utf8' +                    public_body.save! +                end +            end + +            # Editing old versions of public bodies - we don't want to affect the timestamp +            PublicBody::Version.record_timestamps = false +            public_body.versions.each do |public_body_version| +                unless public_body_version.name.valid_encoding? +                    name = convert_string_to_utf8(public_body_version.name).string +                    puts "Bad encoding in PublicBody::Version name, " \ +                    "id: #{public_body_version.id}, old name: #{public_body_version.name.force_encoding('UTF-8')}, " \ +                    "new name: #{name}" +                    unless dryrun +                        public_body_version.name_will_change! +                        public_body_version.name = name +                        public_body_version.save! +                    end +                end +            end +            PublicBody::Version.record_timestamps = true + +        end + +        IncomingMessage.find_each do |incoming_message| +            if (incoming_message.cached_attachment_text_clipped && +                !incoming_message.cached_attachment_text_clipped.valid_encoding?) || +               (incoming_message.cached_main_body_text_folded && +                !incoming_message.cached_main_body_text_folded.valid_encoding?) || +               (incoming_message.cached_main_body_text_unfolded && +                !incoming_message.cached_main_body_text_unfolded.valid_encoding?) +                puts "Bad encoding in IncomingMessage cached fields, :id #{incoming_message.id} " +                unless dryrun +                    incoming_message.clear_in_database_caches! +                end +            end +        end + +        FoiAttachment.find_each do |foi_attachment| +            unescaped_filename = CGI.unescape(foi_attachment.filename) +            unless unescaped_filename.valid_encoding? +                filename = convert_string_to_utf8(unescaped_filename).string +                puts "Bad encoding in FoiAttachment filename, id: #{foi_attachment.id} " \ +                "old filename #{unescaped_filename.force_encoding('UTF-8')}, new filename #{filename}" +                unless dryrun +                    foi_attachment.filename = filename +                    foi_attachment.save! +                end +            end +        end + +        OutgoingMessage.find_each do |outgoing_message| +            unless outgoing_message.raw_body.valid_encoding? + +                raw_body = convert_string_to_utf8(outgoing_message.raw_body).string +                puts "Bad encoding in OutgoingMessage raw_body, id: #{outgoing_message.id} " \ +                "old raw_body: #{outgoing_message.raw_body.force_encoding('UTF-8')}, new raw_body: #{raw_body}" +                unless dryrun +                    outgoing_message.body = raw_body +                    outgoing_message.save! +                end +            end +        end + +        User.find_each do |user| +            unless user.name.valid_encoding? +                name = convert_string_to_utf8(user.name).string +                puts "Bad encoding in User name, id: #{user.id}, " \ +                "old name: #{user.name.force_encoding('UTF-8')}, new name: #{name}" +                unless dryrun +                    user.name = name +                    user.save! +                end +            end +        end + +    end  end | 
