mirror of
https://github.com/sphinx-doc/sphinx.git
synced 2025-02-25 18:55:22 -06:00
Closes #1308: Strip HTML tags from the content of "raw" nodes before feeding it to the search indexer.
This commit is contained in:
parent
f314c40160
commit
aff29be5b3
3
CHANGES
3
CHANGES
@ -88,6 +88,9 @@ Bugs fixed
|
||||
* #1299: Make behavior of the :rst:dir:`math` directive more consistent and
|
||||
avoid producing empty environments in LaTeX output.
|
||||
|
||||
* #1308: Strip HTML tags from the content of "raw" nodes before feeding it
|
||||
to the search indexer.
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
|
@ -10,10 +10,9 @@
|
||||
"""
|
||||
from __future__ import with_statement
|
||||
import re
|
||||
import itertools
|
||||
import cPickle as pickle
|
||||
|
||||
from docutils.nodes import comment, title, Text, NodeVisitor, SkipNode
|
||||
from docutils.nodes import raw, comment, title, Text, NodeVisitor, SkipNode
|
||||
|
||||
from sphinx.util import jsdump, rpartition
|
||||
|
||||
@ -146,7 +145,16 @@ class WordCollector(NodeVisitor):
|
||||
def dispatch_visit(self, node):
|
||||
if node.__class__ is comment:
|
||||
raise SkipNode
|
||||
elif node.__class__ is Text:
|
||||
if node.__class__ is raw:
|
||||
# Some people might put content in raw HTML that should be searched,
|
||||
# so we just amateurishly strip HTML tags and index the remaining
|
||||
# content
|
||||
nodetext = re.sub(r'(?is)<style.*?</style>', '', node.astext())
|
||||
nodetext = re.sub(r'(?is)<script.*?</script>', '', nodetext)
|
||||
nodetext = re.sub(r'<[^<]+?>', '', nodetext)
|
||||
self.found_words.extend(self.lang.split(nodetext))
|
||||
raise SkipNode
|
||||
if node.__class__ is Text:
|
||||
self.found_words.extend(self.lang.split(node.astext()))
|
||||
elif node.__class__ is title:
|
||||
self.found_title_words.extend(self.lang.split(node.astext()))
|
||||
|
Loading…
Reference in New Issue
Block a user