Closes #1308: Strip HTML tags from the content of "raw" nodes before feeding it to the search indexer.

This commit is contained in:
Georg Brandl 2014-01-12 22:04:24 +01:00
parent f314c40160
commit aff29be5b3
2 changed files with 14 additions and 3 deletions

View File

@@ -88,6 +88,9 @@ Bugs fixed
* #1299: Make behavior of the :rst:dir:`math` directive more consistent and
avoid producing empty environments in LaTeX output.
* #1308: Strip HTML tags from the content of "raw" nodes before feeding it
to the search indexer.
Documentation
-------------

View File

@@ -10,10 +10,9 @@
"""
from __future__ import with_statement
import re
import itertools
import cPickle as pickle
from docutils.nodes import comment, title, Text, NodeVisitor, SkipNode
from docutils.nodes import raw, comment, title, Text, NodeVisitor, SkipNode
from sphinx.util import jsdump, rpartition
@@ -146,7 +145,16 @@ class WordCollector(NodeVisitor):
def dispatch_visit(self, node):
if node.__class__ is comment:
raise SkipNode
elif node.__class__ is Text:
if node.__class__ is raw:
# Some people might put content in raw HTML that should be searched,
# so we just amateurishly strip HTML tags and index the remaining
# content
nodetext = re.sub(r'(?is)<style.*?</style>', '', node.astext())
nodetext = re.sub(r'(?is)<script.*?</script>', '', nodetext)
nodetext = re.sub(r'<[^<]+?>', '', nodetext)
self.found_words.extend(self.lang.split(nodetext))
raise SkipNode
if node.__class__ is Text:
self.found_words.extend(self.lang.split(node.astext()))
elif node.__class__ is title:
self.found_title_words.extend(self.lang.split(node.astext()))