mirror of
https://github.com/discourse/discourse.git
synced 2024-12-01 13:09:33 -06:00
283b08d45f
* Move onebox gem in core library * Update template file path * Remove warning for onebox gem caching * Remove onebox version file * Remove onebox gem * Add sanitize gem * Require onebox library in lazy-yt plugin * Remove onebox web specific code This code was used in standalone onebox Sinatra application * Merge Discourse specific AllowlistedGenericOnebox engine in core * Fix onebox engine filenames to match class name casing * Move onebox specs from gem into core * DEV: Rename `response` helper to `onebox_response` Fixes a naming collision. * Require rails_helper * Don't use `before/after(:all)` * Whitespace * Remove fakeweb * Remove poor unit tests * DEV: Re-add fakeweb, plugins are using it * Move onebox helpers * Stub Instagram API * FIX: Follow additional redirect status codes (#476) Don’t throw errors if we encounter 303, 307 or 308 HTTP status codes in responses * Remove an empty file * DEV: Update the license file Using the copy from https://choosealicense.com/licenses/gpl-2.0/# Hopefully this will enable GitHub to show the license UI? * DEV: Update embedded copyrights * DEV: Add Onebox copyright notice * DEV: Add MIT license, convert COPYRIGHT.txt to md * DEV: Remove an incorrect copyright claim Co-authored-by: Jarek Radosz <jradosz@gmail.com> Co-authored-by: jbrw <jamie@goatforce5.org>
98 lines
3.6 KiB
Ruby
98 lines
3.6 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
module Onebox
  module Engine
    # Onebox engine for Wikipedia article URLs.
    #
    # Builds the data hash (link, title, description and, when available, an
    # image) from the parsed article page. When the URL carries a fragment
    # (e.g. ".../Ruby#History") the description is taken from that section
    # instead of the top of the article.
    #
    # `raw` and `link` are not defined in this class; they are presumably
    # supplied by the included modules. `raw` is used here through a
    # Nokogiri-style API (css / xpath / search) -- confirm against the HTML mixin.
    class WikipediaOnebox
      include Engine
      include LayoutSupport
      include HTML

      matches_regexp(/^https?:\/\/.*\.wikipedia\.(com|org)/)
      always_https

      private

      # Assembles the hash describing the article.
      #
      # @return [Hash] with :link, :title and :description, plus :image when an
      #   `.image img` element whose src does not match "Question_book" exists.
      def data
        paras, section_title_text = section_paragraphs

        title = raw.css("html body h1").inner_text
        # If a section sub title exists, add it to the main article title.
        title = "#{title} | #{section_title_text}" if section_title_text

        result = { link: link, title: title, description: description_from(paras) }

        # First image that is not the "Question_book" placeholder icon.
        image = raw.css(".image img")&.find { |img| img["src"] !~ /Question_book/ }
        result[:image] = image["src"] if image

        result
      end

      # Picks the nodes the description is built from.
      #
      # Detects a section hash in the URL and retrieves the related paragraphs;
      # if no hash is provided, or no matching section span exists, all
      # paragraphs of the page are returned.
      # (Section detection originally authored by Lidlanca, 9/8/2014.)
      #
      # @return [Array] two elements: the paragraph/list nodes, and the section
      #   title text (nil when no section was resolved).
      def section_paragraphs
        anchor = @url[/#([^\/?]+)/, 1] # extract url hash
        return [raw.search("p"), nil] unless anchor

        # The fragment comes from a user-supplied URL: bind it as an XPath
        # variable rather than interpolating it, so a quote in the fragment
        # cannot break or alter the expression.
        section_span = raw.xpath("//span[@id=$id]", nil, { id: anchor })
        return [raw.search("p"), nil] if section_span.empty?

        # Parent element of the section span should be the heading node.
        heading = section_span[0].parent

        [paragraphs_after(heading), section_span.inner_text]
      end

      # Walks the siblings following a section heading and collects its
      # <p> and <ul> nodes.
      #
      # p|text|div covers the general case. We assume presence of at least one
      # P node; if the section has no P node we may end up with a P node from
      # the next section. A div is commonly used as an assets wrapper in an
      # article section, often as the first element holding an image. ul
      # support improves the output for a section with a list as its main
      # content (for example an author bibliography or a discography).
      #
      # @param heading the heading node that starts the section
      # @return [Array] the collected <p>/<ul> nodes, possibly empty
      def paragraphs_after(heading)
        collected = []
        content_found = false
        node = heading

        # Stops at the end of the sibling list instead of calling #name on nil.
        while (node = node.next_sibling)
          # Once content has been found, any other tag ends the section.
          # NOTE(review): the pattern is unanchored, so tag names that merely
          # contain "p" (span, pre, ...) also count as content; kept as-is to
          # preserve existing output.
          break if content_found && node.name !~ /p|text|div|ul/

          # A list is treated like a paragraph.
          if node.name == "p" || node.name == "ul"
            content_found = true
            collected << node
          end
        end

        collected
      end

      # Concatenates the text of at most the first four nodes, stopping early
      # once Onebox::LayoutSupport.max_text characters have been gathered, and
      # appends "..." when the result had to be cut.
      #
      # @param paras the paragraph/list nodes to summarise
      # @return [String] the description text ("" when paras is empty)
      def description_from(paras)
        limit = Onebox::LayoutSupport.max_text
        text = +""

        paras.first(4).each_with_index do |node, index|
          break if text.length >= limit

          text << " " unless index.zero?
          text << node_text(node, limit)
        end

        text.length > limit ? "#{text[0..limit]}..." : text
      end

      # Plain text of a single node, cut to the limit and stripped of
      # bracketed numeric markers such as "[12]".
      #
      # A <ul> is rendered as a textual ordered list (1.item | 2.item | 3.item)
      # because the output is treated as a single block of text.
      #
      # @param node a <p> or <ul> node
      # @param limit [Integer] last character index kept (inclusive)
      # @return [String]
      def node_text(node, limit)
        content =
          if node.name == "ul"
            items = node.children.css("li").each_with_index.map do |li, index|
              "#{index + 1}." + li.inner_text
            end
            items.join(" |\n ")
          else
            node.inner_text
          end

        content[0..limit].gsub(/\[\d+\]/mi, "")
      end
    end
  end
end
|