mirror of
https://github.com/discourse/discourse.git
synced 2024-11-25 10:20:58 -06:00
FIX: ignore malformed HTML for title extraction (#18040)
Certain HTML can be rejected by nokogumbo, specifically in cases where there is an enormous number of attributes. This change ensures that such malformed HTML is simply skipped instead of leaking an exception and terminating downstream processes.
This commit is contained in:
parent
5d44c31bfa
commit
df04462475
@ -18,8 +18,15 @@ module RetrieveTitle
|
||||
if html =~ /<title>/ && html !~ /<\/title>/
|
||||
return nil
|
||||
end
|
||||
if doc = Nokogiri::HTML5(html, nil, encoding)
|
||||
|
||||
doc = nil
|
||||
begin
|
||||
doc = Nokogiri::HTML5(html, nil, encoding)
|
||||
rescue ArgumentError
|
||||
# invalid HTML (too many attributes) - ignore
|
||||
end
|
||||
|
||||
if doc
|
||||
title = doc.at('title')&.inner_text
|
||||
|
||||
# A horrible hack - YouTube uses `document.title` to populate the title
|
||||
|
@ -51,6 +51,18 @@ RSpec.describe RetrieveTitle do
|
||||
)
|
||||
expect(title).to eq("Video Title")
|
||||
end
|
||||
|
||||
it "will not exception out for invalid html" do
|
||||
attributes = (1..1000).map { |x| " attr#{x}='1' " }.join
|
||||
title = RetrieveTitle.extract_title <<~HTML
|
||||
<html>
|
||||
<title>test</title>
|
||||
<body #{attributes}>
|
||||
</html>
|
||||
HTML
|
||||
|
||||
expect(title).to eq(nil)
|
||||
end
|
||||
end
|
||||
|
||||
describe ".crawl" do
|
||||
|
Loading…
Reference in New Issue
Block a user