Closes #1067: implement pluggable search scorer and tweak scoring to give good results. Patch by Hernan Grecco.

This commit is contained in:
Georg Brandl
2013-01-04 11:17:32 +01:00
parent d747fa3cd0
commit 1832284e89
6 changed files with 127 additions and 47 deletions

View File

@@ -1,6 +1,10 @@
Release 1.2 (in development)
============================
* #1067: Improve the ordering of the JavaScript search results: matches in titles
come before matches in full text, and object results are better categorized.
Also implement a pluggable search scorer.
* PR#72: #975: Fix gettext does not extract definition terms before docutils 0.10.0
* PR#25: In inheritance diagrams, the first line of the class docstring

View File

@@ -760,6 +760,15 @@ that use Sphinx' HTMLWriter class.
.. versionadded:: 1.1
.. confval:: html_search_scorer
The name of a JavaScript file (relative to the configuration directory) that
implements a search results scorer. If empty, the default will be used.
.. XXX describe interface for scorer here
.. versionadded:: 1.2
.. confval:: htmlhelp_basename
Output file base name for HTML help builder. Default is ``'pydoc'``.

View File

@@ -240,7 +240,8 @@ class StandaloneHTMLBuilder(Builder):
if not lang or lang not in languages:
lang = 'en'
self.indexer = IndexBuilder(self.env, lang,
self.config.html_search_options)
self.config.html_search_options,
self.config.html_search_scorer)
self.load_indexer(docnames)
self.docwriter = HTMLWriter(self)

View File

@@ -110,6 +110,7 @@ class Config(object):
html_secnumber_suffix = ('. ', 'html'),
html_search_language = (None, 'html'),
html_search_options = ({}, 'html'),
html_search_scorer = ('', None),
# HTML help only options
htmlhelp_basename = (lambda self: make_filename(self.project), None),

View File

@@ -161,7 +161,7 @@ class IndexBuilder(object):
'pickle': pickle
}
def __init__(self, env, lang, options):
def __init__(self, env, lang, options, scoring):
self.env = env
# filename -> title
self._titles = {}
@@ -176,6 +176,12 @@ class IndexBuilder(object):
# add language-specific SearchLanguage instance
self.lang = languages[lang](options)
if scoring:
with open(scoring, 'rb') as fp:
self.js_scorer_code = fp.read().decode('utf-8')
else:
self.js_scorer_code = u''
def load(self, stream, format):
"""Reconstruct from frozen data."""
if isinstance(format, basestring):
@@ -305,4 +311,5 @@ class IndexBuilder(object):
return dict(
search_language_stemming_code = self.lang.js_stemmer_code,
search_language_stop_words = jsdump.dumps(sorted(self.lang.stopwords)),
search_scorer_tool = self.js_scorer_code,
)

View File

@@ -9,34 +9,41 @@
*
*/
{{ search_language_stemming_code|safe }}
{% if search_scorer_tool %}
{{ search_scorer_tool|safe }}
{% else %}
/**
* helper function to return a node containing the
* search summary for a given text. keywords is a list
* of stemmed words, hlwords is the list of normal, unstemmed
* words. the first one is used to find the occurrence, the
* latter for highlighting it.
* Simple result scoring code.
*/
var Scorer = {
// Implement the following function to further tweak the score for each result
// The function takes a result array [filename, title, anchor, descr, score]
// and returns the new score.
/*
score: function(result) {
return result[4];
},
*/
jQuery.makeSearchSummary = function(text, keywords, hlwords) {
var textLower = text.toLowerCase();
var start = 0;
$.each(keywords, function() {
var i = textLower.indexOf(this.toLowerCase());
if (i > -1)
start = i;
});
start = Math.max(start - 120, 0);
var excerpt = ((start > 0) ? '...' : '') +
$.trim(text.substr(start, 240)) +
((start + 240 - text.length) ? '...' : '');
var rv = $('<div class="context"></div>').text(excerpt);
$.each(hlwords, function() {
rv = rv.highlightText(this, 'highlighted');
});
return rv;
};
// query matches the full name of an object
objNameMatch: 11,
// or matches in the last dotted part of the object name
objPartialMatch: 6,
// Additive scores depending on the priority of the object
objPrio: {0: 15, // used to be importantResults
1: 5, // used to be objectResults
2: -5}, // used to be unimportantResults
// Used when the priority is not in the mapping.
objPrioDefault: 0,
{{ search_language_stemming_code|safe }}
// query found in title
title: 15,
// query found in terms
term: 5
};
{% endif %}
/**
* Search Module
@@ -184,23 +191,40 @@ var Search = {
}
// lookup as search terms in fulltext
results = results.concat(this.performTermsSearch(searchterms, excluded, terms, 0))
.concat(this.performTermsSearch(searchterms, excluded, titleterms, 20));
results = results.concat(this.performTermsSearch(searchterms, excluded, terms, Scorer.term))
.concat(this.performTermsSearch(searchterms, excluded, titleterms, Scorer.title));
// delete unused variables in order to not waste memory until list is
// retrieved completely
delete filenames, titles, terms, titleterms;
// now sort the regular results by score (in opposite order of appearance,
// since the display function below uses pop() to retrieve items)
// let the scorer override scores with a custom scoring function
if (Scorer.score) {
for (i = 0; i < results.length; i++)
results[i][4] = Scorer.score(results[i]);
}
// now sort the results by score (in opposite order of appearance, since the
// display function below uses pop() to retrieve items) and then
// alphabetically
results.sort(function(a, b) {
var left = a[4];
var right = b[4];
return (left > right) ? 1 : ((left < right) ? -1 : 0);
if (left > right) {
return 1;
} else if (left < right) {
return -1;
} else {
// same score: sort alphabetically
left = a[1].toLowerCase();
right = b[1].toLowerCase();
return (left > right) ? -1 : ((left < right) ? 1 : 0);
}
});
console.info('search results:', results);
Search.lastresults = results.slice(); // a copy
// for debugging
//Search.lastresults = results.slice(); // a copy
//console.info('search results:', Search.lastresults);
// print the results
var resultCount = results.length;
@@ -236,7 +260,7 @@ var Search = {
$.get(DOCUMENTATION_OPTIONS.URL_ROOT + '_sources/' +
item[0] + '.txt', function(data) {
if (data != '') {
listItem.append($.makeSearchSummary(data, searchterms, hlterms));
listItem.append(Search.makeSearchSummary(data, searchterms, hlterms));
Search.output.append(listItem);
}
listItem.slideDown(5, function() {
@@ -281,6 +305,16 @@ var Search = {
for (var name in objects[prefix]) {
var fullname = (prefix ? prefix + '.' : '') + name;
if (fullname.toLowerCase().indexOf(object) > -1) {
var score = 0;
var parts = fullname.split('.');
// check for different match types: exact matches of full name or
// "last name" (i.e. last dotted part)
if (fullname == object || parts[parts.length - 1] == object) {
score += Scorer.objNameMatch;
// matches in last name
} else if (parts[parts.length - 1].indexOf(object) > -1) {
score += Scorer.objPartialMatch;
}
var match = objects[prefix][name];
var objname = objnames[match[1]][2];
var title = titles[match[0]];
@@ -301,20 +335,17 @@ var Search = {
}
}
var descr = objname + _(', in ') + title;
anchor = match[3];
if (anchor == '')
anchor = fullname;
else if (anchor == '-')
anchor = objnames[match[1]][1] + '-' + fullname;
result = [filenames[match[0]], fullname, '#'+anchor, descr, 0];
var score;
switch (match[2]) {
case 1: // normal results -- display between important and fulltext
score = 5; break;
case 0: // "important" results -- show directly after title results
score = 10; break;
case 2: // "unimportant" results -- show after fulltext results
score = -10; break;
// add custom score for some objects according to scorer
if (Scorer.objPrio.hasOwnProperty(match[2])) {
score += Scorer.objPrio[match[2]];
} else {
score += Scorer.objPrioDefault;
}
results.push([filenames[match[0]], fullname, '#'+anchor, descr, score]);
}
@@ -372,11 +403,38 @@ var Search = {
}
// if we have still a valid result we can add it to the result list
if (valid)
if (valid) {
results.push([filenames[file], titles[file], '', null, score]);
}
}
return results;
},
/**
* helper function to return a node containing the
* search summary for a given text. keywords is a list
* of stemmed words, hlwords is the list of normal, unstemmed
* words. the first one is used to find the occurance, the
* latter for highlighting it.
*/
makeSearchSummary : function(text, keywords, hlwords) {
var textLower = text.toLowerCase();
var start = 0;
$.each(keywords, function() {
var i = textLower.indexOf(this.toLowerCase());
if (i > -1)
start = i;
});
start = Math.max(start - 120, 0);
var excerpt = ((start > 0) ? '...' : '') +
$.trim(text.substr(start, 240)) +
((start + 240 - text.length) ? '...' : '');
var rv = $('<div class="context"></div>').text(excerpt);
$.each(hlwords, function() {
rv = rv.highlightText(this, 'highlighted');
});
return rv;
}
};
$(document).ready(function() {