From df0446247568629c85da30c1fa8aff634c79af39 Mon Sep 17 00:00:00 2001
From: Sam <sam.saffron@gmail.com>
Date: Tue, 23 Aug 2022 15:03:57 +1000
Subject: [PATCH] FIX: ignore malformed HTML for title extraction (#18040)

Certain HTML can be rejected by nokogumbo, specifically when the markup
contains an enormous number of attributes.

This ensures that malformed HTML is simply skipped instead of letting the
exception leak out and terminate downstream processes.
---
 lib/retrieve_title.rb           |  9 ++++++++-
 spec/lib/retrieve_title_spec.rb | 12 ++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)
diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb
index 2a8ff118e3f..6de79fca7ad 100644
--- a/lib/retrieve_title.rb
+++ b/lib/retrieve_title.rb
@@ -18,8 +18,15 @@ module RetrieveTitle
     if html =~ /<title>/ && html !~ /<\/title>/
       return nil
     end
-    if doc = Nokogiri::HTML5(html, nil, encoding)
 
+    doc = nil
+    begin
+      doc = Nokogiri::HTML5(html, nil, encoding)
+    rescue ArgumentError
+      # invalid HTML (too many attributes) - ignore
+    end
+
+    if doc
       title = doc.at('title')&.inner_text
 
       # A horrible hack - YouTube uses `document.title` to populate the title
diff --git a/spec/lib/retrieve_title_spec.rb b/spec/lib/retrieve_title_spec.rb
index 7edd62ff08a..b59ff8d1b20 100644
--- a/spec/lib/retrieve_title_spec.rb
+++ b/spec/lib/retrieve_title_spec.rb
@@ -51,6 +51,18 @@ RSpec.describe RetrieveTitle do
       )
       expect(title).to eq("Video Title")
     end
+
+    it "will not exception out for invalid html" do
+      attributes = (1..1000).map { |x| " attr#{x}='1' " }.join
+      title = RetrieveTitle.extract_title <<~HTML
+        <html>
+          <title>test</title>
+          <body #{attributes}>
+        </html>
+      HTML
+
+      expect(title).to eq(nil)
+    end
   end
 
   describe ".crawl" do