diff options
| author | Caleb <caleb.tutty@nzherald.co.nz> | 2015-04-20 23:02:46 +1200 | 
|---|---|---|
| committer | Louise Crow <louise.crow@gmail.com> | 2015-06-04 12:20:02 +0100 | 
| commit | 592434c4b2b8e58416b3cdb3a66513ac206576d1 (patch) | |
| tree | 2a05a48a0ab03cedd233aded6e9870f710b862a3 | |
| parent | 772e05925b1a4f6e3769254c33b2bf9114651bc6 (diff) | |
Fix encoding issue with searching for macrons
Correct indentation to be consistent
| -rw-r--r-- | lib/acts_as_xapian/acts_as_xapian.rb | 72 | ||||
| -rw-r--r-- | spec/integration/xapian_search_highlighting_spec.rb | 11 | 
2 files changed, 48 insertions, 35 deletions
```diff
diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb
index 6520a20a4..f742bae52 100644
--- a/lib/acts_as_xapian/acts_as_xapian.rb
+++ b/lib/acts_as_xapian/acts_as_xapian.rb
@@ -487,41 +487,37 @@ module ActsAsXapian
         # date ranges or similar. Use this for cheap highlighting with
         # TextHelper::highlight, and excerpt.
         def words_to_highlight(opts = {})
-          default_opts = { :include_original => false, :regex => false }
-          opts = default_opts.merge(opts)
-
-          # Reject all prefixes other than Z, which we know is reserved for stems
-          terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) }
-          # Collect the stems including the Z prefix
-          raw_stems = terms.map { |t| t.term if t.term.start_with?('Z') }.compact.uniq.sort
-          # Collect stems, chopping the Z prefix off
-          stems = raw_stems.map { |t| t[1..-1] }.compact.sort
-          # Collect the non-stem terms
-          words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort
-
-          # Add the unstemmed words from the original query
-          # Sometimes stems can be unhelpful with the :regex option, for example
-          # stemming 'boring' results in us trying to highlight 'bore'.
-          if opts[:include_original]
-            raw_stems.each do |raw_stem|
-              words << ActsAsXapian.query_parser.unstem(raw_stem).uniq
-            end
-
-            words = words.any? ? words.flatten.uniq : []
-          end
-
-          if opts[:regex]
-            stems.map! { |w| /\b(#{ w })\w*\b/iu }
-            words.map! { |w| /\b(#{ w })\b/iu }
-          end
-
-          if RUBY_VERSION.to_f >= 1.9
-              (stems + words).map! do |term|
-                  term.is_a?(String) ? term.force_encoding('UTF-8') : term
-              end
-          else
-              stems + words
-          end
+            default_opts = { :include_original => false, :regex => false }
+            opts = default_opts.merge(opts)
+
+            # Reject all prefixes other than Z, which we know is reserved for stems
+            terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) }
+            # Collect the stems including the Z prefix
+            raw_stems = terms.map { |t| t.term if t.term.start_with?('Z') }.compact.uniq.sort
+            # Collect stems, chopping the Z prefix off
+            stems = raw_stems.map { |t| t[1..-1] }.compact.sort
+            # Collect the non-stem terms
+            words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort
+
+            # Add the unstemmed words from the original query
+            # Sometimes stems can be unhelpful with the :regex option, for example
+            # stemming 'boring' results in us trying to highlight 'bore'.
+            if opts[:include_original]
+                raw_stems.each do |raw_stem|
+                    words << ActsAsXapian.query_parser.unstem(raw_stem).uniq
+                end
+
+                words = words.any? ? words.flatten.uniq : []
+            end
+
+            if opts[:regex]
+                stems.map! { |w| /\b(#{ correctly_encode(w) })\w*\b/iu }
+                words.map! { |w| /\b(#{ correctly_encode(w) })\b/iu }
+            end
+
+            (stems + words).map! do |term|
+                term.is_a?(String) ? correctly_encode(term) : term
+            end
         end
 
         # Text for lines in log file
@@ -529,6 +525,12 @@ module ActsAsXapian
             "Search: " + self.query_string
         end
 
+        private
+
+        def correctly_encode(w)
+            RUBY_VERSION.to_f >= 1.9 ? w.force_encoding('UTF-8') : w
+        end
+
     end
 
     # Search for models which contain theimportant terms taken from a specified
diff --git a/spec/integration/xapian_search_highlighting_spec.rb b/spec/integration/xapian_search_highlighting_spec.rb
index 65a34cf91..c0834a2c1 100644
--- a/spec/integration/xapian_search_highlighting_spec.rb
+++ b/spec/integration/xapian_search_highlighting_spec.rb
@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 
 describe 'highlighting search results' do
@@ -36,4 +38,13 @@ describe 'highlighting search results' do
         highlight_matches(phrase, matches).should == '<mark>boring</mark>'
     end
 
+    it 'handles macrons correctly' do
+        phrase = 'Māori'
+
+        search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1)
+        matches = search.words_to_highlight(:regex => true, :include_original => true)
+
+        highlight_matches(phrase, matches).should == '<mark>Māori</mark>'
+    end
+
 end
```
