Closes #1308: Strip HTML tags from the content of "raw" nodes before feeding it to the search indexer.

2025-02-25 18:55:22 -06:00 · 2014-01-12 22:04:24 +01:00
parent f314c40160
commit aff29be5b3
2 changed files with 14 additions and 3 deletions
--- a/3
+++ b/3
@@ -88,6 +88,9 @@ Bugs fixed
 * #1299: Make behavior of the :rst:dir:`math` directive more consistent and
  avoid producing empty environments in LaTeX output.
 * #1308: Strip HTML tags from the content of "raw" nodes before feeding it
  to the search indexer.
 Documentation
 -------------
--- a/sphinx/search/__init__.py
+++ b/sphinx/search/__init__.py
@@ -10,10 +10,9 @@
 """
 from __future__ import with_statement
 import re
 import itertools
 import cPickle as pickle
-from docutils.nodes import comment, title, Text, NodeVisitor, SkipNode
+from docutils.nodes import raw, comment, title, Text, NodeVisitor, SkipNode
 from sphinx.util import jsdump, rpartition
@@ -146,7 +145,16 @@ class WordCollector(NodeVisitor):
    def dispatch_visit(self, node):
        if node.__class__ is comment:
            raise SkipNode
-        elif node.__class__ is Text:
+        if node.__class__ is raw:
            # Some people might put content in raw HTML that should be searched,
            # so we just amateurishly strip HTML tags and index the remaining
            # content
            nodetext = re.sub(r'(?is)<style.*?</style>', '', node.astext())
            nodetext = re.sub(r'(?is)<script.*?</script>', '', nodetext)
            nodetext = re.sub(r'<[^<]+?>', '', nodetext)
            self.found_words.extend(self.lang.split(nodetext))
            raise SkipNode
        if node.__class__ is Text:
            self.found_words.extend(self.lang.split(node.astext()))
        elif node.__class__ is title:
            self.found_title_words.extend(self.lang.split(node.astext()))