FIX: Disallow table cells from being weighted so that actual articles can be the main content (#27508)

For Topic Embeds, we would prefer the <article> element to be treated as the main article of a topic, rather than a table cell (<td>) that may contain a lot of data. However, for an example URL like the one referenced here, the table cell (a very large code snippet) is picked as the Topic Embed's article because of the content weight determined by the Readability library we use. As of the newly released 0.7.1 (cantino/ruby-readability#94), the library has a new option to exclude the <td> element, which it scores by default, from content weighting. This is more in line with the original library, which only weighted <p> elements. So this PR excludes <td> from scoring, as seen in the tests, so that the actual article is recognized as the article. This PR also adds the <details> tag to the allow-list.
2025-02-25 18:55:32 -06:00 · 2024-06-19 09:50:49 +08:00
parent ebdbb199a5
commit 489aac3fdd
2 changed files with 77 additions and 5 deletions
--- a/spec/models/topic_embed_spec.rb
+++ b/spec/models/topic_embed_spec.rb
@@ -23,7 +23,7 @@ RSpec.describe TopicEmbed do
      expect(TopicEmbed.count).to eq(0)
    end

-    it "Allows figure and figcaption HTML tags" do
+    it "Allows figure, figcaption, details HTML tags" do
      html = <<~HTML
        <html>
        <head>
@@ -35,7 +35,10 @@ RSpec.describe TopicEmbed do
            <figure>
              <img src="/a.png">
              <figcaption>Some caption</figcaption>
-            <figure>
+            </figure>
+            <details>
+              some details
+            </details>
          </div>
        </body>
        </html>
@@ -51,13 +54,60 @@ RSpec.describe TopicEmbed do
            <figure>
              <img src="https://blog.discourse.com/a.png">
              <figcaption>Some caption</figcaption>
-            <figure>
-          </figure></figure></div>
+            </figure>
+            <details>
+              some details
+            </details>
+          </div>
        </div></div>
      HTML
      expect(parsed.body.strip).to eq(expected.strip)
    end

+    # ideally, articles get a heavier weightage than td elements
+    # so to force that, we do not allow td elements to be scored
+    it "does not score td tags" do
+      html = <<~HTML
+        <html>
+        <head>
+           <title>Some title</title>
+        </head>
+        <body>
+          <article>
+            article content
+            <table>
+              <tr>
+                <td>
+                  <p>cats</p>
+                  <p>cats</p>
+                </td>
+              </tr>
+            </table>
+          </article>
+        </body>
+        </html>
+      HTML
+
+      parsed = TopicEmbed.parse_html(html, "https://blog.discourse.com/somepost.html")
+
+      expected = <<-HTML
+        <div><div>
+  
+    article content
+    
+      
+        
+          cats
+          cats
+        
+      
+    
+  
+</div></div>
+      HTML
+      expect(parsed.body.strip).to eq(expected.strip)
+    end
+
    context "when creating a post" do
      let!(:post) { TopicEmbed.import(user, url, title, contents) }
      let(:topic_embed) { TopicEmbed.find_by(post: post) }