From df0446247568629c85da30c1fa8aff634c79af39 Mon Sep 17 00:00:00 2001 From: Sam Date: Tue, 23 Aug 2022 15:03:57 +1000 Subject: [PATCH] FIX: ignore malformed HTML for title extraction (#18040) Certain HTML can be rejected by nokogumbo, specifically cases where there are enormous amounts of attributes. This ensures that malformed HTML is simply skipped instead of leaking out an exception and terminating downstream processes. --- lib/retrieve_title.rb | 9 ++++++++- spec/lib/retrieve_title_spec.rb | 12 ++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index 2a8ff118e3f..6de79fca7ad 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -18,8 +18,15 @@ module RetrieveTitle if html =~ // && html !~ /<\/title>/ return nil end - if doc = Nokogiri::HTML5(html, nil, encoding) + doc = nil + begin + doc = Nokogiri::HTML5(html, nil, encoding) + rescue ArgumentError + # invalid HTML (too many attributes) - ignore + end + + if doc title = doc.at('title')&.inner_text # A horrible hack - YouTube uses `document.title` to populate the title diff --git a/spec/lib/retrieve_title_spec.rb b/spec/lib/retrieve_title_spec.rb index 7edd62ff08a..b59ff8d1b20 100644 --- a/spec/lib/retrieve_title_spec.rb +++ b/spec/lib/retrieve_title_spec.rb @@ -51,6 +51,18 @@ RSpec.describe RetrieveTitle do ) expect(title).to eq("Video Title") end + + it "will not exception out for invalid html" do + attributes = (1..1000).map { |x| " attr#{x}='1' " }.join + title = RetrieveTitle.extract_title <<~HTML + <html> + <title>test + + + HTML + + expect(title).to eq(nil) + end end describe ".crawl" do