Replace Hpricot with Nokogiri

This commit is contained in:
Jaime Iniesta
2013-02-12 09:46:45 -05:00
parent 84a167725d
commit 6995e75d41
15 changed files with 36 additions and 34 deletions

View File

@@ -9,7 +9,7 @@ class CookedPostProcessor
@dirty = false
@opts = opts
@post = post
@doc = Hpricot(post.cooked)
@doc = Nokogiri::HTML(post.cooked)
end
def dirty?

View File

@@ -34,7 +34,7 @@ module Oneboxer
if Whitelist.allowed?(url)
page_html = open(url).read
if page_html.present?
doc = Hpricot(page_html)
doc = Nokogiri::HTML(page_html)
# See if it has an oembed thing we can use
(doc/"link[@type='application/json+oembed']").each do |oembed|
@@ -56,7 +56,7 @@ module Oneboxer
# Parse URLs out of HTML, returning the document when finished.
def self.each_onebox_link(string_or_doc)
doc = string_or_doc
doc = Hpricot(doc) if doc.is_a?(String)
doc = Nokogiri::HTML(doc) if doc.is_a?(String)
onebox_links = doc.search("a.onebox")
if onebox_links.present?

View File

@@ -22,19 +22,19 @@ module Oneboxer
end
def parse(data)
hp = Hpricot(data)
html_doc = Nokogiri::HTML(data)
result = {}
result[:title] = hp.at("h1")
result[:title] = html_doc.at("h1")
result[:title] = result[:title].inner_html if result[:title].present?
image = hp.at(".main-image img")
image = html_doc.at(".main-image img")
result[:image] = image['src'] if image
result[:by_info] = hp.at("#by-line")
result[:by_info] = html_doc.at("#by-line")
result[:by_info] = BaseOnebox.remove_whitespace(result[:by_info].inner_html) if result[:by_info].present?
summary = hp.at("#description-and-details-content")
summary = html_doc.at("#description-and-details-content")
result[:text] = summary.inner_html if summary.present?
result

View File

@@ -12,20 +12,20 @@ module Oneboxer
def parse(data)
hp = Hpricot(data)
html_doc = Nokogiri::HTML(data)
result = {}
m = hp.at("h1.doc-banner-title")
m = html_doc.at("h1.doc-banner-title")
result[:title] = m.inner_text if m
m = hp.at("div#doc-original-text")
m = html_doc.at("div#doc-original-text")
if m
result[:text] = BaseOnebox.replace_tags_with_spaces(m.inner_html)
result[:text] = result[:text][0..MAX_TEXT]
end
m = hp.at("div.doc-banner-icon img")
m = html_doc.at("div.doc-banner-icon img")
result[:image] = m['src'] if m
result

View File

@@ -17,17 +17,17 @@ module Oneboxer
def parse(data)
hp = Hpricot(data)
html_doc = Nokogiri::HTML(data)
result = {}
m = hp.at("h1")
m = html_doc.at("h1")
result[:title] = m.inner_text if m
m = hp.at("h4 ~ p")
m = html_doc.at("h4 ~ p")
result[:text] = m.inner_text[0..MAX_TEXT] if m
m = hp.at(".product img.artwork")
m = html_doc.at(".product img.artwork")
result[:image] = m['src'] if m
result

View File

@@ -9,7 +9,7 @@ module Oneboxer
page_html = open(@url).read
return nil if page_html.blank?
doc = Hpricot(page_html)
doc = Nokogiri::HTML(page_html)
# Flickr's oembed just stopped returning images for no reason. Let's use opengraph instead.
open_graph = Oneboxer.parse_open_graph(doc)

View File

@@ -20,23 +20,23 @@ module Oneboxer
def parse(data)
hp = Hpricot(data)
html_doc = Nokogiri::HTML(data)
result = {}
title = hp.at('title').inner_html
title = html_doc.at('title').inner_html
result[:title] = title.gsub!(/ - Wikipedia, the free encyclopedia/, '') if title.present?
# get the first image > 150 pix high
images = hp.search("img").select { |img| img['height'].to_i > 150 }
images = html_doc.search("img").select { |img| img['height'].to_i > 150 }
result[:image] = "http:#{images[0]["src"]}" unless images.empty?
# remove the table from mobile layout, as it can contain paras in some rare cases
hp.search("table").remove
html_doc.search("table").remove
# get all the paras
paras = hp.search("p")
paras = html_doc.search("p")
text = ""
unless paras.empty?