Closes #1308: Strip HTML tags from the content of "raw" nodes before feeding it to the search indexer.

2025-02-25 18:55:22 -06:00 · 2014-01-12 22:04:24 +01:00 · 2014-01-12 22:04:24 +01:00 · aff29be5b3
commit aff29be5b3
parent f314c40160
2 changed files with 14 additions and 3 deletions
--- a/3
+++ b/3
@@ -88,6 +88,9 @@ Bugs fixed
 * #1299: Make behavior of the :rst:dir:`math` directive more consistent and
  avoid producing empty environments in LaTeX output.

+* #1308: Strip HTML tags from the content of "raw" nodes before feeding it
+  to the search indexer.
+
 Documentation
 -------------

--- a/sphinx/search/__init__.py
+++ b/sphinx/search/__init__.py
@@ -10,10 +10,9 @@
 """
 from __future__ import with_statement
 import re
-import itertools
 import cPickle as pickle

-from docutils.nodes import comment, title, Text, NodeVisitor, SkipNode
+from docutils.nodes import raw, comment, title, Text, NodeVisitor, SkipNode

 from sphinx.util import jsdump, rpartition

@@ -146,7 +145,16 @@ class WordCollector(NodeVisitor):
    def dispatch_visit(self, node):
        if node.__class__ is comment:
            raise SkipNode
-        elif node.__class__ is Text:
+        if node.__class__ is raw:
+            # Some people might put content in raw HTML that should be searched,
+            # so we just amateurishly strip HTML tags and index the remaining
+            # content
+            nodetext = re.sub(r'(?is)<style.*?</style>', '', node.astext())
+            nodetext = re.sub(r'(?is)<script.*?</script>', '', nodetext)
+            nodetext = re.sub(r'<[^<]+?>', '', nodetext)
+            self.found_words.extend(self.lang.split(nodetext))
+            raise SkipNode
+        if node.__class__ is Text:
            self.found_words.extend(self.lang.split(node.astext()))
        elif node.__class__ is title:
            self.found_title_words.extend(self.lang.split(node.astext()))