Improve relevance scoring in HTML search results (#12441)

Co-authored-by: Will Lachance <wlach@protonmail.com>
Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
This commit is contained in:
James Addison 2024-07-11 11:55:40 +01:00 committed by GitHub
parent e7beb8bc5c
commit 91c5cd3abd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 132 additions and 3 deletions

View File

@ -112,6 +112,10 @@ Bugs fixed
* #12425: Use Docutils' SVG processing in the HTML builder
and remove Sphinx's custom logic.
Patch by Tunç Başar Köse.
* #12391: Adjust scoring of matches during HTML search so that document main
titles tend to rank higher than subsection titles. In addition, boost matches
on the name of programming domain objects relative to title/subtitle matches.
Patch by James Addison and Will Lachance.
Testing
-------

View File

@ -328,13 +328,14 @@ const Search = {
for (const [title, foundTitles] of Object.entries(allTitles)) {
if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) {
for (const [file, id] of foundTitles) {
let score = Math.round(100 * queryLower.length / title.length)
const score = Math.round(Scorer.title * queryLower.length / title.length);
const boost = titles[file] === title ? 1 : 0; // add a boost for document titles
normalResults.push([
docNames[file],
titles[file] !== title ? `${titles[file]} > ${title}` : title,
id !== null ? "#" + id : "",
null,
score,
score + boost,
filenames[file],
]);
}

View File

@ -0,0 +1 @@
Search.setIndex({"alltitles": {"Main Page": [[0, null]], "Relevance": [[0, "relevance"], [1, null]]}, "docnames": ["index", "relevance"], "envversion": {"sphinx": 61, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst", "relevance.rst"], "indexentries": {"example (class in relevance)": [[0, "relevance.Example", false]], "module": [[0, "module-relevance", false]], "relevance": [[0, "module-relevance", false]], "relevance (relevance.example attribute)": [[0, "relevance.Example.relevance", false]]}, "objects": {"": [[0, 0, 0, "-", "relevance"]], "relevance": [[0, 1, 1, "", "Example"]], "relevance.Example": [[0, 2, 1, "", "relevance"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "attribute", "Python attribute"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:attribute"}, "terms": {"": [0, 1], "A": 1, "For": 1, "In": [0, 1], "against": 0, "also": 1, "an": 0, "answer": 0, "appear": 1, "ar": 1, "area": 0, "ask": 0, "attribut": 0, "built": 1, "can": [0, 1], "class": 0, "code": [0, 1], "consid": 1, "contain": 0, "context": 0, "corpu": 1, "could": 1, "demonstr": 0, "describ": 1, "detail": 1, "determin": 1, "docstr": 0, "document": [0, 1], "domain": 1, "engin": 0, "exampl": [0, 1], "extract": 0, "find": 0, "found": 0, "from": 0, "function": 1, "ha": 1, "handl": 0, "happen": 1, "head": 0, "help": 0, "highli": 1, "how": 0, "i": [0, 1], "improv": 0, "inform": 0, "intend": 0, "issu": 1, "itself": 1, "knowledg": 0, "languag": 1, "less": 1, "like": [0, 1], "match": 0, "mention": 1, "name": [0, 1], "object": 0, "one": 1, "onli": 1, "other": 0, "page": 1, "part": 1, "particular": 0, "printf": 1, "program": 1, "project": 0, "queri": [0, 1], "question": 0, "re": 0, "rel": 
0, "research": 0, "result": 1, "sai": 0, "same": 1, "score": 0, "search": [0, 1], "seem": 0, "softwar": 1, "some": 1, "sphinx": 0, "straightforward": 1, "subject": 0, "subsect": 0, "term": [0, 1], "test": 0, "text": 0, "than": 1, "thei": 0, "them": 0, "thi": 0, "titl": 0, "user": [0, 1], "we": [0, 1], "when": 0, "whether": 1, "within": 0, "would": 1}, "titles": ["Main Page", "Relevance"], "titleterms": {"main": 0, "page": 0, "relev": [0, 1]}})

View File

@ -0,0 +1,6 @@
# Sphinx configuration for a small test project.
import os
import sys

# Put the absolute path of the current working directory at the front of the
# import path -- presumably so that autodoc can import local modules.
sys.path[:0] = [os.path.abspath('.')]

# Sphinx extensions enabled for this project.
extensions = [
    'sphinx.ext.autodoc',
]

View File

@ -0,0 +1,20 @@
Main Page
=========
This is the main page of the ``titles`` test project.
In particular, this test project is intended to demonstrate how Sphinx
can handle scoring of query matches against document titles and subsection
heading titles relative to other document matches such as terms found within
document text and object names extracted from code.
Relevance
---------
In the context of search engines, we can say that a document is **relevant**
to a user's query when it contains information that seems likely to help them
find an answer to a question they're asking, or to improve their knowledge of
the subject area they're researching.
.. automodule:: relevance
:members:

View File

@ -0,0 +1,7 @@
class Example:
    """Example class"""

    # Two plain class attributes; neither is followed by a docstring.
    num_attribute = 5
    text_attribute = "string"

    # The string literal directly below this assignment is the attribute's
    # docstring; keep the two lines adjacent.
    # NOTE(review): presumably this is what lets autodoc's ``:members:``
    # document ``relevance`` but not the attributes above -- confirm.
    relevance = "testing"
    """attribute docstring"""

View File

@ -0,0 +1,13 @@
Relevance
=========
In some domains, it can be straightforward to determine whether a search result
is relevant to the user's query.
For example, if we are in a software programming language domain, and a user
has issued a query for the term ``printf``, then we could consider a document
in the corpus that describes a built-in language function with the same name
as (highly) relevant. A document that only happens to mention the ``printf``
function name as part of some example code that appears on the page would
also be relevant, but likely less relevant than the one that describes the
function itself in detail.

View File

@ -7,6 +7,23 @@ describe('Basic html theme search', function() {
return req.responseText;
}
/**
 * Assert that every entry of `expectedRanking` occurs in `results`, in the
 * given relative order. Other results may appear before, between or after
 * the expected entries.
 *
 * @param {Array<Array<string>>} expectedRanking - `[page, title, target]`
 *   triples, listed in the order they are expected to be encountered when
 *   `results` is read from its last element to its first.
 * @param {Array<Array>} results - search results whose first three elements
 *   are `[page, title, target]`. The array is not modified.
 */
function checkRanking(expectedRanking, results) {
  let [nextExpected, ...remainingItems] = expectedRanking;

  // Scan from the end of `results` towards the start (presumably the search
  // returns its best match last -- confirm against Search._performSearch).
  // Reverse a copy so that the caller's array keeps its order.
  for (const result of [...results].reverse()) {
    if (!nextExpected) break;

    const [expectedPage, expectedTitle, expectedTarget] = nextExpected;
    const [page, title, target] = result;

    if (page === expectedPage && title === expectedTitle && target === expectedTarget) {
      // Found the entry we were waiting for; advance to the next one.
      [nextExpected, ...remainingItems] = remainingItems;
    }
  }

  expect(remainingItems.length).toEqual(0);
  // `remainingItems` is already empty while the final expected entry is still
  // pending in `nextExpected`, so that entry has to be checked separately.
  expect(nextExpected).toEqual(undefined);
}
describe('terms search', function() {
it('should find "C++" when in index', function() {
@ -76,7 +93,7 @@ describe('Basic html theme search', function() {
'Main Page',
'',
null,
100,
16,
'index.rst'
]
];
@ -85,6 +102,66 @@ describe('Basic html theme search', function() {
});
describe('search result ranking', function() {

  /*
   * These tests should not prescribe precise expected ordering of search
   * results; instead each test case should describe a single relevance rule
   * that helps users to locate relevant information efficiently.
   *
   * If you think that one of the rules seems to be poorly-defined or is
   * limiting the potential for search algorithm improvements, please check
   * for existing discussion/bugreports related to it on GitHub[1] before
   * creating one yourself. Suggestions for possible improvements are also
   * welcome.
   *
   * [1] - https://github.com/sphinx-doc/sphinx.git/
   */

  /*
   * Each test evaluates the pre-built "titles" search index fixture (a
   * `Search.setIndex(...)` call), runs the query 'relevance', and checks the
   * relative order of two results. Locals are declared with `const` so that
   * no state leaks between tests through implicit globals.
   */

  it('should score a code module match above a page-title match', function() {
    eval(loadFixture("titles/searchindex.js"));

    const expectedRanking = [
      ['index', 'relevance', '#module-relevance'],  /* py:module documentation */
      ['relevance', 'Relevance', ''],  /* main title */
    ];

    const searchParameters = Search._parseQuery('relevance');
    const results = Search._performSearch(...searchParameters);

    checkRanking(expectedRanking, results);
  });

  it('should score a main-title match above an object member match', function() {
    eval(loadFixture("titles/searchindex.js"));

    /*
     * The attribute's anchor is its full name, matching the fixture's index
     * entry for `relevance.Example.relevance`; `#module-relevance` is the
     * anchor of the module itself.
     */
    const expectedRanking = [
      ['relevance', 'Relevance', ''],  /* main title */
      ['index', 'relevance.Example.relevance', '#relevance.Example.relevance'],  /* py:class attribute */
    ];

    const searchParameters = Search._parseQuery('relevance');
    const results = Search._performSearch(...searchParameters);

    checkRanking(expectedRanking, results);
  });

  it('should score a main-title match above a subheading-title match', function() {
    eval(loadFixture("titles/searchindex.js"));

    const expectedRanking = [
      ['relevance', 'Relevance', ''],  /* main title */
      ['index', 'Main Page > Relevance', '#relevance'],  /* subsection heading title */
    ];

    const searchParameters = Search._parseQuery('relevance');
    const results = Search._performSearch(...searchParameters);

    checkRanking(expectedRanking, results);
  });

});
});
describe("htmlToText", function() {