2013-05-27 18:48:47 -05:00
|
|
|
# SAX document handler that turns parsed HTML into a short excerpt.
#
# The SAX callbacks (start_element / end_element / characters) append to
# @excerpt until roughly @length counted characters have been emitted, then
# abort the parse with `throw :done`. Use ExcerptParser.get_excerpt as the
# entry point; it sets up the parser and the matching `catch(:done)`.
class ExcerptParser < Nokogiri::XML::SAX::Document
|
|
|
|
|
|
|
|
# The excerpt markup accumulated so far by the SAX callbacks.
attr_reader :excerpt
|
|
|
|
|
2014-09-04 00:03:12 -05:00
|
|
|
# Matches an opening <span> tag whose class attribute is exactly "excerpt",
# quoted with either single or double quotes. The quote character class is
# ['"] -- a "|" inside a character class is a literal pipe, not alternation,
# so it must not be listed there.
SPAN_REGEX = /<\s*span[^>]*class\s*=\s*['"]excerpt['"][^>]*>/
|
|
|
|
|
2013-06-03 15:12:24 -05:00
|
|
|
# Sets up an empty excerpt and reads the behaviour flags.
#
# length  - maximum number of counted characters to emit before truncating.
# options - optional Hash (nil is treated as empty). Each flag is enabled only
#           when its value is exactly `true`:
#           :strip_links, :text_entities, :markdown_images,
#           :keep_newlines, :keep_emojis
def initialize(length, options=nil)
  @length = length
  @excerpt = ""
  @current_length = 0

  # Must be an assignment: a bare `options || {}` discards its result and
  # leaves options nil, which makes every lookup below raise NoMethodError.
  options ||= {}

  @strip_links = options[:strip_links] == true
  @text_entities = options[:text_entities] == true
  @markdown_images = options[:markdown_images] == true
  @keep_newlines = options[:keep_newlines] == true
  @keep_emojis = options[:keep_emojis] == true
  @start_excerpt = false
end
|
|
|
|
|
|
|
|
# Parses +html+ and returns the stripped excerpt string.
#
# html    - HTML source; nil is treated as an empty string.
# length  - maximum number of counted characters. Ignored (replaced by the
#           full html length) when the html contains a <span class="excerpt">
#           marker, so the marked section is never truncated.
# options - Hash of flags forwarded to ExcerptParser.new; nil is treated as
#           an empty Hash. When options[:text_entities] is exactly true the
#           result is additionally passed through CGI.unescapeHTML.
def self.get_excerpt(html, length, options)
  html ||= ''
  # Normalise once so both the parser and the lookup below tolerate nil.
  options ||= {}

  # Cheap substring test first; only run the regex when it could match.
  length = html.length if html.include?('excerpt') && html =~ SPAN_REGEX

  me = self.new(length, options)
  parser = Nokogiri::HTML::SAX::Parser.new(me)

  # The handler throws :done once the excerpt is complete, which aborts
  # the remainder of the parse.
  catch(:done) do
    parser.parse(html)
  end

  excerpt = me.excerpt.strip
  excerpt = CGI.unescapeHTML(excerpt) if options[:text_entities] == true
  excerpt
end
|
|
|
|
|
2014-07-24 21:15:43 -05:00
|
|
|
# Escapes a value for use inside a double-quoted HTML attribute.
#
# v - the attribute value (String) or nil.
#
# Returns a new String with & " < > replaced by their HTML entities, or ""
# when v is nil. The argument itself is never modified.
def escape_attribute(v)
  return "" unless v

  # Single pass, so the "&" of a freshly inserted entity is never re-escaped.
  v.gsub(/[&"<>]/, "&" => "&amp;", "\"" => "&quot;", "<" => "&lt;", ">" => "&gt;")
end
|
|
|
|
|
2013-06-05 17:54:46 -05:00
|
|
|
# Appends an opening tag for +name+ to the excerpt verbatim.
#
# name       - tag name.
# attributes - enumerable of key/value pairs (Array of pairs or Hash); every
#              value goes through escape_attribute.
#
# The markup is handed to #characters with truncation, length counting and
# HTML-encoding all switched off.
def include_tag(name, attributes)
  rendered = attributes.map do |key, value|
    %(#{key}="#{escape_attribute(value)}")
  end
  characters("<#{name} #{rendered.join(' ')}>", false, false, false)
end
|
|
|
|
|
2013-05-27 18:48:47 -05:00
|
|
|
# SAX callback invoked for each opening tag.
#
# name       - tag name.
# attributes - Array of [key, value] pairs.
#
# aside      - sets @in_quote, which makes #characters drop all text.
# a          - unless links are stripped, emits the tag and sets @in_a.
# div / span - class "excerpt" restarts the excerpt from scratch and sets
#              @start_excerpt; class "spoiler" is re-emitted as a <span>.
# img        - emoji images are kept as tags when @keep_emojis is set;
#              any other image becomes a "[label]" placeholder, wrapped as
#              "![label](src)" when @markdown_images is set.
def start_element(name, attributes=[])
  case name
  when "aside"
    @in_quote = true

  when "a"
    unless @strip_links
      include_tag(name, attributes)
      @in_a = true
    end

  when "div", "span"
    if attributes.include?(["class", "excerpt"])
      # Discard everything collected before the explicit excerpt marker.
      @excerpt = ""
      @current_length = 0
      @start_excerpt = true
    end

    # Preserve spoilers
    if attributes.include?(["class", "spoiler"])
      include_tag("span", attributes)
      @in_spoiler = true
    end

  when "img"
    attrs = Hash[*attributes.flatten]

    return include_tag(name, attrs) if @keep_emojis && attrs["class"] == 'emoji'

    # With markdown_images the image is written in markdown image syntax.
    characters("!") if @markdown_images

    # Label preference: alt text, then title, then the localized fallback.
    label = attrs["alt"] || attrs["title"] || I18n.t('excerpt_image')
    characters("[#{label}]")

    characters("(#{attrs['src']})") if @markdown_images
  end
end
|
|
|
|
|
|
|
|
# SAX callback invoked for each closing tag.
#
# aside      - clears @in_quote so text is collected again.
# p / br     - emits a literal "<br>" when @keep_newlines is set, otherwise
#              a single (counted) space.
# a          - unless links are stripped, emits "</a>" and clears @in_a.
# div / span - finishes the parse via `throw :done` once an explicit excerpt
#              section has been opened; otherwise closes a pending spoiler.
def end_element(name)
  case name
  when "aside"
    @in_quote = false

  when "p", "br"
    @keep_newlines ? characters("<br>", false, false, false) : characters(" ")

  when "a"
    return if @strip_links

    characters("</a>", false, false, false)
    @in_a = false

  when "div", "span"
    throw :done if @start_excerpt

    characters("</span>", false, false, false) if @in_spoiler
    @in_spoiler = false
  end
end
|
|
|
|
|
|
|
|
# Appends text to the excerpt, enforcing the length budget.
#
# string   - text (or raw markup) to append.
# truncate - when the budget would be exceeded, append the part of +string+
#            that still fits (true) or nothing at all (false).
# count_it - whether +string+ counts against @length. Uncounted strings never
#            trigger truncation.
# encode   - HTML-escape the text with ERB::Util.html_escape before appending.
#
# Does nothing while inside a quote (@in_quote). When the budget would be
# exceeded it appends the ellipsis, closes a pending <a>, and throws :done
# to end the parse.
def characters(string, truncate = true, count_it = true, encode = true)
  return if @in_quote

  if count_it && @current_length + string.length > @length
    # Number of characters that still fit. Slicing by length rather than an
    # inclusive range means an already exhausted budget contributes nothing,
    # instead of leaking one extra character past the limit.
    remaining = [@length - @current_length, 0].max

    if truncate
      head = string[0, remaining]
      @excerpt << (encode ? ERB::Util.html_escape(head) : head)
    end

    @excerpt << (@text_entities ? "..." : "…")
    @excerpt << "</a>" if @in_a
    throw :done
  end

  @excerpt << (encode ? ERB::Util.html_escape(string) : string)
  @current_length += string.length if count_it
end
|
|
|
|
end
|