FIX: ignore malformed HTML for title extraction (#18040)

Certain HTML can be rejected by nokogumbo, specifically in cases where there
is an enormous number of attributes.

This ensures that malformed HTML is simply skipped instead of letting an
exception escape and terminate downstream processes.
This commit is contained in:
Sam 2022-08-23 15:03:57 +10:00 committed by GitHub
parent 5d44c31bfa
commit df04462475
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 1 deletions

View File

@ -18,8 +18,15 @@ module RetrieveTitle
if html =~ /<title>/ && html !~ /<\/title>/
return nil
end
if doc = Nokogiri::HTML5(html, nil, encoding)
doc = nil
begin
doc = Nokogiri::HTML5(html, nil, encoding)
rescue ArgumentError
# invalid HTML (too many attributes) - ignore
end
if doc
title = doc.at('title')&.inner_text
# A horrible hack - YouTube uses `document.title` to populate the title

View File

@ -51,6 +51,18 @@ RSpec.describe RetrieveTitle do
)
expect(title).to eq("Video Title")
end
it "will not exception out for invalid html" do
# Regression spec for the parser-rejection case: build 1000 attributes for a
# single tag, which the accompanying fix treats as "invalid HTML (too many
# attributes)" by rescuing ArgumentError around Nokogiri::HTML5.
attributes = (1..1000).map { |x| " attr#{x}='1' " }.join
title = RetrieveTitle.extract_title <<~HTML
<html>
<title>test</title>
<body #{attributes}>
</html>
HTML
# The rejected document is skipped as a whole, so nil is expected even though
# the markup contains a well-formed <title> element.
expect(title).to eq(nil)
end
end
describe ".crawl" do