Fix #3045: HTML search index creator should ignore "raw" content if not html

2025-02-25 18:55:22 -06:00 · 2016-10-17 16:06:45 +09:00 · 2016-10-17 16:06:45 +09:00 · 53ea1cb280
commit 53ea1cb280
parent 78d96b4abb
4 changed files with 27 additions and 9 deletions
--- a/1
+++ b/1
@@ -53,6 +53,7 @@ Bugs fixed
 * #3031: incompatibility with LaTeX package ``tocloft``
 * #3003: literal blocks in footnotes are not supported by Latex
 * #3047: spacing before footnote in pdf output is not coherent and allows breaks
+* #3045: HTML search index creator should ignore "raw" content if not html

 Testing
 --------
--- a/sphinx/search/__init__.py
+++ b/sphinx/search/__init__.py
@@ -196,13 +196,14 @@ class WordCollector(NodeVisitor):
        if issubclass(nodetype, comment):
            raise SkipNode
        if issubclass(nodetype, raw):
-            # Some people might put content in raw HTML that should be searched,
-            # so we just amateurishly strip HTML tags and index the remaining
-            # content
-            nodetext = re.sub(r'(?is)<style.*?</style>', '', node.astext())
-            nodetext = re.sub(r'(?is)<script.*?</script>', '', nodetext)
-            nodetext = re.sub(r'<[^<]+?>', '', nodetext)
-            self.found_words.extend(self.lang.split(nodetext))
+            if 'html' in node.get('format', '').split():
+                # Some people might put content in raw HTML that should be searched,
+                # so we just amateurishly strip HTML tags and index the remaining
+                # content
+                nodetext = re.sub(r'(?is)<style.*?</style>', '', node.astext())
+                nodetext = re.sub(r'(?is)<script.*?</script>', '', nodetext)
+                nodetext = re.sub(r'<[^<]+?>', '', nodetext)
+                self.found_words.extend(self.lang.split(nodetext))
            raise SkipNode
        if issubclass(nodetype, Text):
            self.found_words.extend(self.lang.split(node.astext()))
--- a/tests/roots/test-search/index.rst
+++ b/tests/roots/test-search/index.rst
@@ -17,4 +17,12 @@ textinheading

 .. toctree::

-   tocitem
+   tocitem
+
+.. raw:: html
+
+   <span class="raw">rawword"</span>
+
+.. raw:: latex
+
+   latex_keyword
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -114,4 +114,12 @@ def test_term_in_heading_and_section(app, status, warning):
    # both documents should be a hit in the search index as a title,
    # respectively text hit
    assert 'textinhead:1' in searchindex
-    assert 'textinhead:0' in searchindex
+    assert 'textinhead:0' in searchindex
+
+
+@with_app(testroot='search')
+def test_term_in_raw_directive(app, status, warning):
+    searchindex = jsload(app.outdir / 'searchindex.js')
+    assert not is_registered_term(searchindex, 'raw')
+    assert is_registered_term(searchindex, 'rawword')
+    assert not is_registered_term(searchindex, 'latex_keyword')