discourse/lib/excerpt_parser.rb

# SAX handler that condenses an HTML fragment into a short excerpt.
#
# Text is accumulated in #excerpt until the configured length is reached, at
# which point a truncation marker is appended and parsing is aborted by
# throwing :done (caught in ExcerptParser.get_excerpt).
#
# Recognised options (each is enabled only when its value is exactly `true`):
#   :strip_links        - do not emit <a> tags (their text is still emitted)
#   :strip_images       - omit non-emoji images entirely
#   :text_entities      - use "..." instead of "&hellip;" as truncation marker;
#                         get_excerpt additionally unescapes HTML entities
#   :markdown_images    - emit images as "![alt](src)" instead of "[alt]"
#   :keep_newlines      - emit "<br>" instead of a space when <p>/<br> close
#   :keep_emoji_images  - keep emoji <img> tags instead of their alt text
#   :keep_onebox_source - do not skip <aside> elements whose class contains
#                         "onebox"; skip <article class="onebox-body"> instead
#   :remap_emoji        - replace emoji images with Emoji.lookup_unicode(alt)
class ExcerptParser < Nokogiri::XML::SAX::Document

  # The excerpt built so far (String).
  attr_reader :excerpt

  # Matches an opening <span> tag whose class attribute is exactly "excerpt",
  # quoted with either single or double quotes.
  SPAN_REGEX = /<\s*span[^>]*class\s*=\s*['"]excerpt['"][^>]*>/

  # length  - maximum number of counted characters in the excerpt
  # options - Hash of the flags documented on the class, or nil for defaults
  def initialize(length, options = nil)
    @length = length
    @excerpt = +""
    @current_length = 0
    # `options` defaults to nil, so it must be replaced before being indexed.
    options ||= {}
    @strip_links = options[:strip_links] == true
    @strip_images = options[:strip_images] == true
    @text_entities = options[:text_entities] == true
    @markdown_images = options[:markdown_images] == true
    @keep_newlines = options[:keep_newlines] == true
    @keep_emoji_images = options[:keep_emoji_images] == true
    @keep_onebox_source = options[:keep_onebox_source] == true
    @remap_emoji = options[:remap_emoji] == true

    # Parser state.
    @start_excerpt = false
    @in_a = false
    @in_quote = false
    @in_spoiler = false
    @in_summary = false
    @in_details_depth = 0
    @summary_contents = +""
    @detail_contents = +""
  end

  # Builds an excerpt of `html`.
  #
  # html    - HTML String to summarise (nil is treated as "")
  # length  - maximum excerpt length; ignored (replaced by html.length) when
  #           the markup contains a <span class="excerpt"> marker
  # options - Hash of the flags documented on the class, or nil
  #
  # Returns the excerpt String, stripped of surrounding whitespace.
  def self.get_excerpt(html, length, options = nil)
    html ||= ''
    options ||= {}
    # The cheap substring test avoids running the regex on most posts.
    length = html.length if html.include?('excerpt') && html =~ SPAN_REGEX

    me = new(length, options)
    parser = Nokogiri::HTML::SAX::Parser.new(me)
    # The handler throws :done once the excerpt is complete.
    catch(:done) do
      parser.parse(html)
    end

    excerpt = me.excerpt.strip
    excerpt = excerpt.gsub(/\s*\n+\s*/, "\n\n") if options[:keep_onebox_source]
    excerpt = CGI.unescapeHTML(excerpt) if options[:text_entities] == true
    excerpt
  end

  # Escapes `v` for use inside a double-quoted HTML attribute.
  # Returns "" for nil. The argument itself is never mutated.
  def escape_attribute(v)
    return "" unless v

    # "&" must be replaced first so the entities added below are not re-escaped.
    v.gsub("&", "&amp;")
      .gsub("\"", "&#34;")
      .gsub("<", "&lt;")
      .gsub(">", "&gt;")
  end

  # Emits an opening tag `name` with the given attributes (name/value pairs or
  # a Hash). The tag is written verbatim: not truncated, not counted towards
  # the length limit and not HTML-encoded (attribute values are escaped here).
  def include_tag(name, attributes)
    rendered = attributes.map { |k, v| "#{k}=\"#{escape_attribute(v)}\"" }.join(' ')
    characters("<#{name} #{rendered}>", false, false, false)
  end

  # SAX callback for an opening tag.
  # `attributes` is an Array of [name, value] pairs.
  def start_element(name, attributes = [])
    case name
    when "img"
      attributes = attributes.to_h

      if attributes["class"]&.include?('emoji')
        if @remap_emoji
          title = (attributes["alt"] || "").gsub(":", "")
          title = Emoji.lookup_unicode(title) || attributes["alt"]
          return characters(title)
        elsif @keep_emoji_images
          return include_tag(name, attributes)
        else
          return characters(attributes["alt"])
        end
      end

      unless @strip_images
        # With markdown_images the image is rendered as ![label](src),
        # otherwise only the bracketed label is emitted.
        characters("!") if @markdown_images

        if !attributes["alt"].blank?
          characters("[#{attributes["alt"]}]")
        elsif !attributes["title"].blank?
          characters("[#{attributes["title"]}]")
        else
          characters("[#{I18n.t 'excerpt_image'}]")
        end

        characters("(#{attributes['src']})") if @markdown_images
      end

    when "a"
      unless @strip_links
        include_tag(name, attributes)
        @in_a = true
      end

    when "aside"
      attributes = attributes.to_h
      # Asides are skipped, except oneboxes when their source is to be kept.
      # `&.` guards against an <aside> that has no class attribute at all.
      unless @keep_onebox_source && attributes['class']&.include?('onebox')
        @in_quote = true
      end

    when 'article'
      if @keep_onebox_source && attributes.include?(['class', 'onebox-body'])
        @in_quote = true
      end

    when "div", "span"
      # An explicit excerpt marker discards everything collected so far.
      if attributes.include?(["class", "excerpt"])
        @excerpt = +""
        @current_length = 0
        @start_excerpt = true
      end
      # Preserve spoilers
      if attributes.include?(["class", "spoiler"])
        include_tag("span", attributes)
        @in_spoiler = true
      end

    when "details"
      if @in_details_depth == 0
        @detail_contents = +""
        # Also drop the previous block's summary so a <details> without its
        # own <summary> does not inherit stale text.
        @summary_contents = +""
      end
      @in_details_depth += 1

    when "summary"
      # Only the summary of the outermost <details> is tracked.
      if @in_details_depth == 1 && !@in_summary
        @summary_contents = +""
        @in_summary = true
      end

    end
  end

  # SAX callback for a closing tag.
  def end_element(name)
    case name
    when "a"
      unless @strip_links
        characters("</a>", false, false, false)
        @in_a = false
      end
    when "p", "br"
      if @keep_newlines
        characters("<br>", false, false, false)
      else
        characters(" ")
      end
    when "aside"
      @in_quote = false
    when "details"
      @in_details_depth -= 1
      if @in_details_depth == 0
        full = "<details><summary>#{clean(@summary_contents)}</summary>#{clean(@detail_contents)}</details>"
        if @current_length + full.length > @length
          # Too long: emit only a (truncated) disabled summary. The text was
          # decoded by the parser, so it has to be escaped again before it is
          # embedded in markup.
          summary = ERB::Util.html_escape(@summary_contents[0..@length])
          @excerpt << "<details class='disabled'><summary>#{summary}</summary></details>"
        else
          @excerpt << full
        end
      end
    when "summary"
      @in_summary = false if @in_details_depth == 1
    when "div", "span"
      # Closing a div/span after an excerpt marker was seen ends the excerpt.
      throw :done if @start_excerpt
      characters("</span>", false, false, false) if @in_spoiler
      @in_spoiler = false
    end
  end

  # Strips surrounding whitespace from `str` and HTML-escapes it.
  def clean(str)
    ERB::Util.html_escape(str.strip)
  end

  # SAX callback for text nodes; also used internally to append markup.
  #
  # string   - text to append (nil is treated as "")
  # truncate - when the limit is exceeded, still append the leading part
  # count_it - count the text towards the length limit
  # encode   - HTML-escape the text before appending
  #
  # Throws :done once a counted string would exceed the length limit.
  def characters(string, truncate = true, count_it = true, encode = true)
    return if @in_quote

    # Callers may pass nil (e.g. a missing alt attribute); #length and #<<
    # below need a String.
    string = string.to_s

    # Inside <details> the text is buffered and emitted when the block closes.
    if @in_details_depth > 0
      if @in_summary
        @summary_contents << string
      else
        @detail_contents << string
      end
      return
    end

    render = encode ? lambda { |s| ERB::Util.html_escape(s) } : lambda { |s| s }

    if count_it && @current_length + string.length > @length
      last_index = [0, @length - @current_length - 1].max
      @excerpt << render.call(string[0..last_index]) if truncate
      @excerpt << (@text_entities ? "..." : "&hellip;")
      # Keep the markup balanced when truncating inside a link.
      @excerpt << "</a>" if @in_a
      throw :done
    end

    @excerpt << render.call(string)
    @current_length += string.length if count_it
  end
end
refactor 2013-05-28 09:48:47 +10:00			`class ExcerptParser < Nokogiri::XML::SAX::Document`

			`attr_reader :excerpt`

FEATURE: Allow manual excerpt to be specified anywhere in the post and override max excerpt length 2014-09-03 22:03:12 -07:00			`SPAN_REGEX = /<\sspan[^>]class\s=\s['\|"]excerpt['\|"][^>]*>/`

Add rubocop to our build. (#5004) 2017-07-28 10:20:09 +09:00			`def initialize(length, options = nil)`
refactor 2013-05-28 09:48:47 +10:00			`@length = length`
			`@excerpt = ""`
			`@current_length = 0`
Better HTML emails, smarter email digests, new email section in admin with digest preview 2013-06-03 16:12:24 -04:00			`options \|\| {}`
refactor 2013-05-28 09:48:47 +10:00			`@strip_links = options[:strip_links] == true`
FEATURE: omit images from og and twitter description tags 2017-11-28 12:27:43 +01:00			`@strip_images = options[:strip_images] == true`
Better HTML emails, smarter email digests, new email section in admin with digest preview 2013-06-03 16:12:24 -04:00			`@text_entities = options[:text_entities] == true`
Allow images in the daily digest for top scoring posts 2013-06-05 18:54:46 -04:00			`@markdown_images = options[:markdown_images] == true`
FIX: clean html before sending it to jquery for collapsing 2015-05-20 14:42:54 +10:00			`@keep_newlines = options[:keep_newlines] == true`
FIX: Emoji in Discourse onebox is wrapped in square brackets. 2015-12-14 21:46:15 +08:00			`@keep_emoji_images = options[:keep_emoji_images] == true`
FEATURE: Add option for `ExcerptParser` to keep onebox source. 2017-04-10 16:11:58 +08:00			`@keep_onebox_source = options[:keep_onebox_source] == true`
FEATURE: remap emojis back for push notifications and desktop alerts 2016-10-11 13:03:21 +11:00			`@remap_emoji = options[:remap_emoji] == true`
Feature: allow mods to cut pinned topic excerpts 2014-07-17 21:32:17 +10:00			`@start_excerpt = false`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-22 19:17:35 +01:00			`@in_details_depth = 0`
FIX: details tags broke excerpts 2017-12-19 17:28:55 -05:00			`@summary_contents = ""`
FIX: Don't disable details when below truncate limit 2017-12-20 15:44:36 -05:00			`@detail_contents = ""`
refactor 2013-05-28 09:48:47 +10:00			`end`

			`def self.get_excerpt(html, length, options)`
FEATURE: Allow manual excerpt to be specified anywhere in the post and override max excerpt length 2014-09-03 22:03:12 -07:00			`html \|\|= ''`
FIX: properly unescape HTML entities in excerpts 2014-12-10 12:52:51 +01:00			`length = html.length if html.include?('excerpt') && SPAN_REGEX === html`
FEATURE: Allow manual excerpt to be specified anywhere in the post and override max excerpt length 2014-09-03 22:03:12 -07:00			`me = self.new(length, options)`
refactor 2013-05-28 09:48:47 +10:00			`parser = Nokogiri::HTML::SAX::Parser.new(me)`
			`catch(:done) do`
FEATURE: Allow manual excerpt to be specified anywhere in the post and override max excerpt length 2014-09-03 22:03:12 -07:00			`parser.parse(html)`
refactor 2013-05-28 09:48:47 +10:00			`end`
FIX: properly unescape HTML entities in excerpts 2014-12-10 12:52:51 +01:00			`excerpt = me.excerpt.strip`
FEATURE: Add option for `ExcerptParser` to keep onebox source. 2017-04-10 16:11:58 +08:00			`excerpt = excerpt.gsub(/\s\n+\s/, "\n\n") if options[:keep_onebox_source]`
FIX: properly unescape HTML entities in excerpts 2014-12-10 12:52:51 +01:00			`excerpt = CGI.unescapeHTML(excerpt) if options[:text_entities] == true`
			`excerpt`
refactor 2013-05-28 09:48:47 +10:00			`end`

SECURITY: fix XSS in excerpt parser 2014-07-25 12:15:43 +10:00			`def escape_attribute(v)`
FIX: blank page on user page in rare cases 2014-10-07 11:37:27 +11:00			`return "" unless v`

			`v = v.dup`
			`v.gsub!("&", "&")`
			`v.gsub!("\"", """)`
			`v.gsub!("<", "<")`
			`v.gsub!(">", ">")`
			`v`
SECURITY: fix XSS in excerpt parser 2014-07-25 12:15:43 +10:00			`end`

Allow images in the daily digest for top scoring posts 2013-06-05 18:54:46 -04:00			`def include_tag(name, attributes)`
Add rubocop to our build. (#5004) 2017-07-28 10:20:09 +09:00			`characters("<#{name} #{attributes.map { \|k, v\| "#{k}=\"#{escape_attribute(v)}\"" }.join(' ')}>", false, false, false)`
Allow images in the daily digest for top scoring posts 2013-06-05 18:54:46 -04:00			`end`

Add rubocop to our build. (#5004) 2017-07-28 10:20:09 +09:00			`def start_element(name, attributes = [])`
refactor 2013-05-28 09:48:47 +10:00			`case name`
Add rubocop to our build. (#5004) 2017-07-28 10:20:09 +09:00			`when "img"`
			`attributes = Hash[*attributes.flatten]`
FEATURE: Display emojis in user stream. 2015-07-23 23:02:03 +08:00
FEATURE: omit images from og and twitter description tags 2017-11-28 12:27:43 +01:00			`if attributes["class"]&.include?('emoji')`
			`if @remap_emoji`
			`title = (attributes["alt"] \|\| "").gsub(":", "")`
			`title = Emoji.lookup_unicode(title) \|\| attributes["alt"]`
			`return characters(title)`
			`elsif @keep_emoji_images`
			`return include_tag(name, attributes)`
			`else`
			`return characters(attributes["alt"])`
FEATURE: Display emojis in user stream. 2015-07-23 23:02:03 +08:00			`end`
FEATURE: omit images from og and twitter description tags 2017-11-28 12:27:43 +01:00			`end`
FEATURE: Display emojis in user stream. 2015-07-23 23:02:03 +08:00
FEATURE: omit images from og and twitter description tags 2017-11-28 12:27:43 +01:00			`unless @strip_images`
Allow images in the daily digest for top scoring posts 2013-06-05 18:54:46 -04:00			`# If include_images is set, include the image in markdown`
			`characters("!") if @markdown_images`

FIX: Handle cases where `alt` and `title` tag is blank when parsing excerpt. 2017-04-11 12:12:51 +08:00			`if !attributes["alt"].blank?`
refactor 2013-05-28 09:48:47 +10:00			`characters("[#{attributes["alt"]}]")`
FIX: Handle cases where `alt` and `title` tag is blank when parsing excerpt. 2017-04-11 12:12:51 +08:00			`elsif !attributes["title"].blank?`
refactor 2013-05-28 09:48:47 +10:00			`characters("[#{attributes["title"]}]")`
			`else`
allow to translate image's exceprt 2015-01-23 17:57:01 +08:00			`characters("[#{I18n.t 'excerpt_image'}]")`
refactor 2013-05-28 09:48:47 +10:00			`end`
Allow images in the daily digest for top scoring posts 2013-06-05 18:54:46 -04:00
			`characters("(#{attributes['src']})") if @markdown_images`
FEATURE: omit images from og and twitter description tags 2017-11-28 12:27:43 +01:00			`end`
Allow images in the daily digest for top scoring posts 2013-06-05 18:54:46 -04:00
Add rubocop to our build. (#5004) 2017-07-28 10:20:09 +09:00			`when "a"`
			`unless @strip_links`
			`include_tag(name, attributes)`
			`@in_a = true`
			`end`
Preserve spoiler tags in post excerpts. 2014-02-20 14:18:30 +05:30
Add rubocop to our build. (#5004) 2017-07-28 10:20:09 +09:00			`when "aside"`
			`attributes = Hash[*attributes.flatten]`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-22 19:17:35 +01:00			`unless @keep_onebox_source && attributes['class'].include?('onebox')`
			`@in_quote = true`
			`end`
FIX: Quotes should be ignored when parsing for onebox source. 2017-04-11 15:13:21 +08:00
Add rubocop to our build. (#5004) 2017-07-28 10:20:09 +09:00			`when 'article'`
			`if @keep_onebox_source && attributes.include?(['class', 'onebox-body'])`
			`@in_quote = true`
			`end`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-22 19:17:35 +01:00
Add rubocop to our build. (#5004) 2017-07-28 10:20:09 +09:00			`when "div", "span"`
			`if attributes.include?(["class", "excerpt"])`
			`@excerpt = ""`
			`@current_length = 0`
			`@start_excerpt = true`
			`end`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-22 19:17:35 +01:00			`# Preserve spoilers`
			`if attributes.include?(["class", "spoiler"])`
			`include_tag("span", attributes)`
			`@in_spoiler = true`
			`end`

FIX: details tags broke excerpts 2017-12-19 17:28:55 -05:00			`when "details"`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-22 19:17:35 +01:00			`@detail_contents = "" if @in_details_depth == 0`
			`@in_details_depth += 1`

FIX: details tags broke excerpts 2017-12-19 17:28:55 -05:00			`when "summary"`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-22 19:17:35 +01:00			`if @in_details_depth == 1 && !@in_summary`
			`@summary_contents = ""`
			`@in_summary = true`
			`end`

refactor 2013-05-28 09:48:47 +10:00			`end`
			`end`

			`def end_element(name)`
			`case name`
			`when "a"`
			`unless @strip_links`
Add rubocop to our build. (#5004) 2017-07-28 10:20:09 +09:00			`characters("</a>", false, false, false)`
refactor 2013-05-28 09:48:47 +10:00			`@in_a = false`
			`end`
			`when "p", "br"`
FIX: clean html before sending it to jquery for collapsing 2015-05-20 14:42:54 +10:00			`if @keep_newlines`
			`characters("<br>", false, false, false)`
			`else`
			`characters(" ")`
			`end`
refactor 2013-05-28 09:48:47 +10:00			`when "aside"`
			`@in_quote = false`
FIX: details tags broke excerpts 2017-12-19 17:28:55 -05:00			`when "details"`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-22 19:17:35 +01:00			`@in_details_depth -= 1`
			`if @in_details_depth == 0`
			`full = "<details><summary>#{clean(@summary_contents)}</summary>#{clean(@detail_contents)}</details>"`
			`if @current_length + full.length > @length`
			`@excerpt << "<details class='disabled'><summary>#{@summary_contents[0..@length]}</summary></details>"`
			`else`
			`@excerpt << full`
			`end`
FIX: details tags broke excerpts 2017-12-19 17:28:55 -05:00			`end`
FIX: Don't disable details when below truncate limit 2017-12-20 15:44:36 -05:00			`when "summary"`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-22 19:17:35 +01:00			`@in_summary = false if @in_details_depth == 1`
Preserve spoiler tags in post excerpts. 2014-02-20 14:18:30 +05:30			`when "div", "span"`
Feature: allow mods to cut pinned topic excerpts 2014-07-17 21:32:17 +10:00			`throw :done if @start_excerpt`
Preserve spoiler tags in post excerpts. 2014-02-20 14:18:30 +05:30			`characters("</span>", false, false, false) if @in_spoiler`
			`@in_spoiler = false`
refactor 2013-05-28 09:48:47 +10:00			`end`
			`end`

FIX: Don't disable details when below truncate limit 2017-12-20 15:44:36 -05:00			`def clean(str)`
			`ERB::Util.html_escape(str.strip)`
			`end`

refactor 2013-05-28 09:48:47 +10:00			`def characters(string, truncate = true, count_it = true, encode = true)`
			`return if @in_quote`
FIX: details tags broke excerpts 2017-12-19 17:28:55 -05:00
FIX: exception in excerpt parser for null nodes 2017-08-17 16:13:21 -04:00			`# we call length on this so might as well ensure we have a string`
			`string = string.to_s`
FIX: support for generating excerpt when nesting <details> blocks 2018-01-22 19:17:35 +01:00			`if @in_details_depth > 0`
FIX: details tags broke excerpts 2017-12-19 17:28:55 -05:00			`if @in_summary`
			`@summary_contents << string`
FIX: Don't disable details when below truncate limit 2017-12-20 15:44:36 -05:00			`else`
			`@detail_contents << string`
FIX: details tags broke excerpts 2017-12-19 17:28:55 -05:00			`end`
			`return`
			`end`
FIX: exception in excerpt parser for null nodes 2017-08-17 16:13:21 -04:00
Add rubocop to our build. (#5004) 2017-07-28 10:20:09 +09:00			`encode = encode ? lambda { \|s\| ERB::Util.html_escape(s) } : lambda { \|s\| s }`
refactor 2013-05-28 09:48:47 +10:00			`if count_it && @current_length + string.length > @length`
			`length = [0, @length - @current_length - 1].max`
			`@excerpt << encode.call(string[0..length]) if truncate`
Better HTML emails, smarter email digests, new email section in admin with digest preview 2013-06-03 16:12:24 -04:00			`@excerpt << (@text_entities ? "..." : "…")`
refactor 2013-05-28 09:48:47 +10:00			`@excerpt << "</a>" if @in_a`
			`throw :done`
			`end`
			`@excerpt << encode.call(string)`
			`@current_length += string.length if count_it`
			`end`
			`end`