From d27d7c8ccac49bd99ad54a73c8f16fe3e74b02a9 Mon Sep 17 00:00:00 2001 From: Chema Balsas Date: Wed, 11 Aug 2021 21:56:55 +0100 Subject: [PATCH] FIX: Unescapes hash section when present to account for url-encoded chars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sections with characters outside the unreserved set (e.g. non-ASCII) will appear url-encoded and need to be unescaped before being used. Wikipedia generates 2 different spans in this case in the same page, one with an id resulting from replacing the % symbols with . and the other with the decoded version of the string. For example, for /wiki/foo#A%C3%A1A it will generate: <span id="A.C3.A1A"></span><span id="AáA">AáA</span> Unescaping the `m_url_hash_name` should work in all cases to target the proper section span. --- lib/onebox/engine/wikipedia_onebox.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/onebox/engine/wikipedia_onebox.rb b/lib/onebox/engine/wikipedia_onebox.rb index e86a9014334..6d7d40424a0 100644 --- a/lib/onebox/engine/wikipedia_onebox.rb +++ b/lib/onebox/engine/wikipedia_onebox.rb @@ -24,7 +24,7 @@ module Onebox end unless m_url_hash.nil? - section_header_title = raw.xpath("//span[@id='#{m_url_hash_name}']") + section_header_title = raw.xpath("//span[@id='#{CGI.unescape(m_url_hash_name)}']") if section_header_title.empty? paras = raw.search("p") # default get all the paras