mirror of
https://github.com/sphinx-doc/sphinx.git
synced 2025-02-25 18:55:22 -06:00
Improve relevance scoring in HTML search results (#12441)
Co-authored-by: Will Lachance <wlach@protonmail.com> Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
This commit is contained in:
parent
e7beb8bc5c
commit
91c5cd3abd
@ -112,6 +112,10 @@ Bugs fixed
|
||||
* #12425: Use Docutils' SVG processing in the HTML builder
|
||||
and remove Sphinx's custom logic.
|
||||
Patch by Tunç Başar Köse.
|
||||
* #12391: Adjust scoring of matches during HTML search so that document main
|
||||
titles tend to rank higher than subsection titles. In addition, boost matches
|
||||
on the name of programming domain objects relative to title/subtitle matches.
|
||||
Patch by James Addison and Will Lachance.
|
||||
|
||||
Testing
|
||||
-------
|
||||
|
@ -328,13 +328,14 @@ const Search = {
|
||||
for (const [title, foundTitles] of Object.entries(allTitles)) {
|
||||
if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) {
|
||||
for (const [file, id] of foundTitles) {
|
||||
let score = Math.round(100 * queryLower.length / title.length)
|
||||
const score = Math.round(Scorer.title * queryLower.length / title.length);
|
||||
const boost = titles[file] === title ? 1 : 0; // add a boost for document titles
|
||||
normalResults.push([
|
||||
docNames[file],
|
||||
titles[file] !== title ? `${titles[file]} > ${title}` : title,
|
||||
id !== null ? "#" + id : "",
|
||||
null,
|
||||
score,
|
||||
score + boost,
|
||||
filenames[file],
|
||||
]);
|
||||
}
|
||||
|
1
tests/js/fixtures/titles/searchindex.js
Normal file
1
tests/js/fixtures/titles/searchindex.js
Normal file
@ -0,0 +1 @@
|
||||
Search.setIndex({"alltitles": {"Main Page": [[0, null]], "Relevance": [[0, "relevance"], [1, null]]}, "docnames": ["index", "relevance"], "envversion": {"sphinx": 61, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst", "relevance.rst"], "indexentries": {"example (class in relevance)": [[0, "relevance.Example", false]], "module": [[0, "module-relevance", false]], "relevance": [[0, "module-relevance", false]], "relevance (relevance.example attribute)": [[0, "relevance.Example.relevance", false]]}, "objects": {"": [[0, 0, 0, "-", "relevance"]], "relevance": [[0, 1, 1, "", "Example"]], "relevance.Example": [[0, 2, 1, "", "relevance"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "attribute", "Python attribute"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:attribute"}, "terms": {"": [0, 1], "A": 1, "For": 1, "In": [0, 1], "against": 0, "also": 1, "an": 0, "answer": 0, "appear": 1, "ar": 1, "area": 0, "ask": 0, "attribut": 0, "built": 1, "can": [0, 1], "class": 0, "code": [0, 1], "consid": 1, "contain": 0, "context": 0, "corpu": 1, "could": 1, "demonstr": 0, "describ": 1, "detail": 1, "determin": 1, "docstr": 0, "document": [0, 1], "domain": 1, "engin": 0, "exampl": [0, 1], "extract": 0, "find": 0, "found": 0, "from": 0, "function": 1, "ha": 1, "handl": 0, "happen": 1, "head": 0, "help": 0, "highli": 1, "how": 0, "i": [0, 1], "improv": 0, "inform": 0, "intend": 0, "issu": 1, "itself": 1, "knowledg": 0, "languag": 1, "less": 1, "like": [0, 1], "match": 0, "mention": 1, "name": [0, 1], "object": 0, "one": 1, "onli": 1, "other": 0, "page": 1, "part": 1, "particular": 0, "printf": 1, "program": 1, "project": 0, "queri": [0, 1], "question": 0, "re": 0, "rel": 
0, "research": 0, "result": 1, "sai": 0, "same": 1, "score": 0, "search": [0, 1], "seem": 0, "softwar": 1, "some": 1, "sphinx": 0, "straightforward": 1, "subject": 0, "subsect": 0, "term": [0, 1], "test": 0, "text": 0, "than": 1, "thei": 0, "them": 0, "thi": 0, "titl": 0, "user": [0, 1], "we": [0, 1], "when": 0, "whether": 1, "within": 0, "would": 1}, "titles": ["Main Page", "Relevance"], "titleterms": {"main": 0, "page": 0, "relev": [0, 1]}})
|
6
tests/js/roots/titles/conf.py
Normal file
6
tests/js/roots/titles/conf.py
Normal file
@ -0,0 +1,6 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.abspath('.'))
|
||||
|
||||
extensions = ['sphinx.ext.autodoc']
|
20
tests/js/roots/titles/index.rst
Normal file
20
tests/js/roots/titles/index.rst
Normal file
@ -0,0 +1,20 @@
|
||||
Main Page
|
||||
=========
|
||||
|
||||
This is the main page of the ``titles`` test project.
|
||||
|
||||
In particular, this test project is intended to demonstrate how Sphinx
|
||||
can handle scoring of query matches against document titles and subsection
|
||||
heading titles relative to other document matches such as terms found within
|
||||
document text and object names extracted from code.
|
||||
|
||||
Relevance
|
||||
---------
|
||||
|
||||
In the context of search engines, we can say that a document is **relevant**
|
||||
to a user's query when it contains information that seems likely to help them
|
||||
find an answer to a question they're asking, or to improve their knowledge of
|
||||
the subject area they're researching.
|
||||
|
||||
.. automodule:: relevance
|
||||
:members:
|
7
tests/js/roots/titles/relevance.py
Normal file
7
tests/js/roots/titles/relevance.py
Normal file
@ -0,0 +1,7 @@
|
||||
class Example:
|
||||
"""Example class"""
|
||||
num_attribute = 5
|
||||
text_attribute = "string"
|
||||
|
||||
relevance = "testing"
|
||||
"""attribute docstring"""
|
13
tests/js/roots/titles/relevance.rst
Normal file
13
tests/js/roots/titles/relevance.rst
Normal file
@ -0,0 +1,13 @@
|
||||
Relevance
|
||||
=========
|
||||
|
||||
In some domains, it can be straightforward to determine whether a search result
|
||||
is relevant to the user's query.
|
||||
|
||||
For example, if we are in a software programming language domain, and a user
|
||||
has issued a query for the term ``printf``, then we could consider a document
|
||||
in the corpus that describes a built-in language function with the same name
|
||||
as (highly) relevant. A document that only happens to mention the ``printf``
|
||||
function name as part of some example code that appears on the page would
|
||||
also be relevant, but likely less relevant than the one that describes the
|
||||
function itself in detail.
|
@ -7,6 +7,23 @@ describe('Basic html theme search', function() {
|
||||
return req.responseText;
|
||||
}
|
||||
|
||||
/**
 * Assert that every entry of `expectedRanking` occurs in `results`, in the
 * given relative order.
 *
 * `results` is walked from its last element to its first, so the final
 * element is treated as the first-ranked result. Unrelated results may sit
 * between the expected entries; only their relative order is checked.
 *
 * @param {Array<Array>} expectedRanking - `[page, title, target]` triples,
 *   listed in the order they are expected to be encountered.
 * @param {Array<Array>} results - result rows whose first three items are
 *   page, title and target; any further items are ignored.
 */
function checkRanking(expectedRanking, results) {
  let [nextExpected, ...remainingItems] = expectedRanking;

  // Walk a reversed copy: Array.prototype.reverse() works in place and would
  // otherwise reorder the caller's array.
  for (const result of [...results].reverse()) {
    if (!nextExpected) break;

    const [expectedPage, expectedTitle, expectedTarget] = nextExpected;
    const [page, title, target] = result;

    if (page == expectedPage && title == expectedTitle && target == expectedTarget) {
      [nextExpected, ...remainingItems] = remainingItems;
    }
  }

  // `remainingItems` is already empty while the final expected entry is still
  // pending in `nextExpected`, so both must be checked to be sure that every
  // expected entry was actually found.
  expect(remainingItems.length).toEqual(0);
  expect(nextExpected).toEqual(undefined);
}
|
||||
|
||||
describe('terms search', function() {
|
||||
|
||||
it('should find "C++" when in index', function() {
|
||||
@ -76,7 +93,7 @@ describe('Basic html theme search', function() {
|
||||
'Main Page',
|
||||
'',
|
||||
null,
|
||||
100,
|
||||
16,
|
||||
'index.rst'
|
||||
]
|
||||
];
|
||||
@ -85,6 +102,66 @@ describe('Basic html theme search', function() {
|
||||
|
||||
});
|
||||
|
||||
describe('search result ranking', function() {

  /*
   * These tests should not prescribe precise expected ordering of search
   * results; instead each test case should describe a single relevance rule
   * that helps users to locate relevant information efficiently.
   *
   * If you think that one of the rules seems to be poorly-defined or is
   * limiting the potential for search algorithm improvements, please check
   * for existing discussion/bugreports related to it on GitHub[1] before
   * creating one yourself. Suggestions for possible improvements are also
   * welcome.
   *
   * [1] - https://github.com/sphinx-doc/sphinx.git/
   */

  it('should score a code module match above a page-title match', function() {
    eval(loadFixture("titles/searchindex.js"));

    // Declared locally so that state does not leak between test cases.
    const expectedRanking = [
      ['index', 'relevance', '#module-relevance'],  /* py:module documentation */
      ['relevance', 'Relevance', ''],  /* main title */
    ];

    const searchParameters = Search._parseQuery('relevance');
    const results = Search._performSearch(...searchParameters);

    checkRanking(expectedRanking, results);
  });

  it('should score a main-title match above an object member match', function() {
    eval(loadFixture("titles/searchindex.js"));

    // The attribute is indexed in the fixture under its fully-qualified name,
    // so that name (not the module anchor) is the expected link target.
    const expectedRanking = [
      ['relevance', 'Relevance', ''],  /* main title */
      ['index', 'relevance.Example.relevance', '#relevance.Example.relevance'],  /* py:class attribute */
    ];

    const searchParameters = Search._parseQuery('relevance');
    const results = Search._performSearch(...searchParameters);

    checkRanking(expectedRanking, results);
  });

  it('should score a main-title match above a subheading-title match', function() {
    eval(loadFixture("titles/searchindex.js"));

    const expectedRanking = [
      ['relevance', 'Relevance', ''],  /* main title */
      ['index', 'Main Page > Relevance', '#relevance'],  /* subsection heading title */
    ];

    const searchParameters = Search._parseQuery('relevance');
    const results = Search._performSearch(...searchParameters);

    checkRanking(expectedRanking, results);
  });

});
|
||||
|
||||
});
|
||||
|
||||
describe("htmlToText", function() {
|
||||
|
Loading…
Reference in New Issue
Block a user