FIX: ignore malformed HTML for title extraction (#18040)

Certain HTML can be rejected by nokogumbo, specifically cases where an element has an enormous number of attributes. This change ensures that malformed HTML is simply skipped instead of leaking an exception and terminating downstream processes.
2025-02-25 18:55:32 -06:00 · 2022-08-23 15:03:57 +10:00
parent 5d44c31bfa
commit df04462475
2 changed files with 20 additions and 1 deletions
--- a/lib/retrieve_title.rb
+++ b/lib/retrieve_title.rb
@@ -18,8 +18,15 @@ module RetrieveTitle
    if html =~ /<title>/ && html !~ /<\/title>/
      return nil
    end
-    if doc = Nokogiri::HTML5(html, nil, encoding)

+    doc = nil
+    begin
+      doc = Nokogiri::HTML5(html, nil, encoding)
+    rescue ArgumentError
+      # invalid HTML (too many attributes) - ignore
+    end
+
+    if doc
      title = doc.at('title')&.inner_text

      # A horrible hack - YouTube uses `document.title` to populate the title
--- a/spec/lib/retrieve_title_spec.rb
+++ b/spec/lib/retrieve_title_spec.rb
@@ -51,6 +51,18 @@ RSpec.describe RetrieveTitle do
      )
      expect(title).to eq("Video Title")
    end
+
+    it "will not exception out for invalid html" do
+      attributes = (1..1000).map { |x| " attr#{x}='1' " }.join
+      title = RetrieveTitle.extract_title <<~HTML
+        <html>
+          <title>test</title>
+          <body #{attributes}>
+        </html>
+      HTML
+
+      expect(title).to eq(nil)
+    end
  end

  describe ".crawl" do