This commit is contained in:
Yoshiki Shibukawa 2016-11-16 11:59:04 +09:00
parent 4583c4b022
commit 873fab53b4
6 changed files with 266 additions and 2 deletions

1
.gitignore vendored
View File

@ -19,3 +19,4 @@ Sphinx.egg-info/
doc/_build/
tests/.coverage
tests/build/
utils/regression_test.js

View File

@ -10,6 +10,7 @@ Bugs fixed
* #3068: Allow the '=' character in the -D option of sphinx-build.py
* #3074: ``add_source_parser()`` crashes in debug mode
* #3135: ``sphinx.ext.autodoc`` crashes with plain Callable
* #3150: Fix query word splitter in JavaScript. It now behaves the same as Python's regular expression.
Release 1.4.8 (released Oct 1, 2016)
====================================

View File

@ -17,7 +17,7 @@ from os import path
from sphinx.util import jsdump, rpartition
from sphinx.util.pycompat import htmlescape
from sphinx.search.jssplitter import splitter_code
class SearchLanguage(object):
"""
@ -241,6 +241,7 @@ class IndexBuilder(object):
self.js_scorer_code = fp.read().decode('utf-8')
else:
self.js_scorer_code = u''
self.js_splitter_code = splitter_code
def load(self, stream, format):
"""Reconstruct from frozen data."""
@ -381,6 +382,7 @@ class IndexBuilder(object):
search_language_stemming_code = self.lang.js_stemmer_code,
search_language_stop_words = jsdump.dumps(sorted(self.lang.stopwords)),
search_scorer_tool = self.js_scorer_code,
search_word_splitter_code = self.js_splitter_code,
)
def get_js_stemmer_rawcode(self):

110
sphinx/search/jssplitter.py Normal file
View File

@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-
"""
sphinx.search.jssplitter
~~~~~~~~~~~~~~~~~~~~~~~~
Provides Python compatible word splitter to JavaScript
:copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
DO NOT EDIT. This is generated by utils/jssplitter_generator.py
"""
splitter_code = """
// Lookup table of word-separator characters, keyed by UTF-16 code unit:
// splitChars[code] is true for every code listed below and undefined for
// all other codes. The table is built once, when this script is loaded.
var splitChars = (function() {
var result = {};
// Separator codes that stand alone (no adjacent separator code).
var singles = [96, 180, 187, 191, 215, 247, 749, 885, 903, 907, 909, 930, 1014, 1648,
1748, 1809, 2416, 2473, 2481, 2526, 2601, 2609, 2612, 2615, 2653, 2702,
2706, 2729, 2737, 2740, 2857, 2865, 2868, 2910, 2928, 2948, 2961, 2971,
2973, 3085, 3089, 3113, 3124, 3213, 3217, 3241, 3252, 3295, 3341, 3345,
3369, 3506, 3516, 3633, 3715, 3721, 3736, 3744, 3748, 3750, 3756, 3761,
3781, 3912, 4239, 4347, 4681, 4695, 4697, 4745, 4785, 4799, 4801, 4823,
4881, 5760, 5901, 5997, 6313, 7405, 8024, 8026, 8028, 8030, 8117, 8125,
8133, 8181, 8468, 8485, 8487, 8489, 8494, 8527, 11311, 11359, 11687, 11695,
11703, 11711, 11719, 11727, 11735, 12448, 12539, 43010, 43014, 43019, 43587,
43696, 43713, 64286, 64297, 64311, 64317, 64319, 64322, 64325, 65141];
var i, j, start, end;
for (i = 0; i < singles.length; i++) {
result[singles[i]] = true;
}
// Runs of consecutive separator codes, stored as [first, last] pairs;
// both ends are inclusive (see the `j <= end` loop below).
var ranges = [[0, 47], [58, 64], [91, 94], [123, 169], [171, 177], [182, 184], [706, 709],
[722, 735], [741, 747], [751, 879], [888, 889], [894, 901], [1154, 1161],
[1318, 1328], [1367, 1368], [1370, 1376], [1416, 1487], [1515, 1519], [1523, 1568],
[1611, 1631], [1642, 1645], [1750, 1764], [1767, 1773], [1789, 1790], [1792, 1807],
[1840, 1868], [1958, 1968], [1970, 1983], [2027, 2035], [2038, 2041], [2043, 2047],
[2070, 2073], [2075, 2083], [2085, 2087], [2089, 2307], [2362, 2364], [2366, 2383],
[2385, 2391], [2402, 2405], [2419, 2424], [2432, 2436], [2445, 2446], [2449, 2450],
[2483, 2485], [2490, 2492], [2494, 2509], [2511, 2523], [2530, 2533], [2546, 2547],
[2554, 2564], [2571, 2574], [2577, 2578], [2618, 2648], [2655, 2661], [2672, 2673],
[2677, 2692], [2746, 2748], [2750, 2767], [2769, 2783], [2786, 2789], [2800, 2820],
[2829, 2830], [2833, 2834], [2874, 2876], [2878, 2907], [2914, 2917], [2930, 2946],
[2955, 2957], [2966, 2968], [2976, 2978], [2981, 2983], [2987, 2989], [3002, 3023],
[3025, 3045], [3059, 3076], [3130, 3132], [3134, 3159], [3162, 3167], [3170, 3173],
[3184, 3191], [3199, 3204], [3258, 3260], [3262, 3293], [3298, 3301], [3312, 3332],
[3386, 3388], [3390, 3423], [3426, 3429], [3446, 3449], [3456, 3460], [3479, 3481],
[3518, 3519], [3527, 3584], [3636, 3647], [3655, 3663], [3674, 3712], [3717, 3718],
[3723, 3724], [3726, 3731], [3752, 3753], [3764, 3772], [3774, 3775], [3783, 3791],
[3802, 3803], [3806, 3839], [3841, 3871], [3892, 3903], [3949, 3975], [3980, 4095],
[4139, 4158], [4170, 4175], [4182, 4185], [4190, 4192], [4194, 4196], [4199, 4205],
[4209, 4212], [4226, 4237], [4250, 4255], [4294, 4303], [4349, 4351], [4686, 4687],
[4702, 4703], [4750, 4751], [4790, 4791], [4806, 4807], [4886, 4887], [4955, 4968],
[4989, 4991], [5008, 5023], [5109, 5120], [5741, 5742], [5787, 5791], [5867, 5869],
[5873, 5887], [5906, 5919], [5938, 5951], [5970, 5983], [6001, 6015], [6068, 6102],
[6104, 6107], [6109, 6111], [6122, 6127], [6138, 6159], [6170, 6175], [6264, 6271],
[6315, 6319], [6390, 6399], [6429, 6469], [6510, 6511], [6517, 6527], [6572, 6592],
[6600, 6607], [6619, 6655], [6679, 6687], [6741, 6783], [6794, 6799], [6810, 6822],
[6824, 6916], [6964, 6980], [6988, 6991], [7002, 7042], [7073, 7085], [7098, 7167],
[7204, 7231], [7242, 7244], [7294, 7400], [7410, 7423], [7616, 7679], [7958, 7959],
[7966, 7967], [8006, 8007], [8014, 8015], [8062, 8063], [8127, 8129], [8141, 8143],
[8148, 8149], [8156, 8159], [8173, 8177], [8189, 8303], [8306, 8307], [8314, 8318],
[8330, 8335], [8341, 8449], [8451, 8454], [8456, 8457], [8470, 8472], [8478, 8483],
[8506, 8507], [8512, 8516], [8522, 8525], [8586, 9311], [9372, 9449], [9472, 10101],
[10132, 11263], [11493, 11498], [11503, 11516], [11518, 11519], [11558, 11567],
[11622, 11630], [11632, 11647], [11671, 11679], [11743, 11822], [11824, 12292],
[12296, 12320], [12330, 12336], [12342, 12343], [12349, 12352], [12439, 12444],
[12544, 12548], [12590, 12592], [12687, 12689], [12694, 12703], [12728, 12783],
[12800, 12831], [12842, 12880], [12896, 12927], [12938, 12976], [12992, 13311],
[19894, 19967], [40908, 40959], [42125, 42191], [42238, 42239], [42509, 42511],
[42540, 42559], [42592, 42593], [42607, 42622], [42648, 42655], [42736, 42774],
[42784, 42785], [42889, 42890], [42893, 43002], [43043, 43055], [43062, 43071],
[43124, 43137], [43188, 43215], [43226, 43249], [43256, 43258], [43260, 43263],
[43302, 43311], [43335, 43359], [43389, 43395], [43443, 43470], [43482, 43519],
[43561, 43583], [43596, 43599], [43610, 43615], [43639, 43641], [43643, 43647],
[43698, 43700], [43703, 43704], [43710, 43711], [43715, 43738], [43742, 43967],
[44003, 44015], [44026, 44031], [55204, 55215], [55239, 55242], [55292, 55295],
[57344, 63743], [64046, 64047], [64110, 64111], [64218, 64255], [64263, 64274],
[64280, 64284], [64434, 64466], [64830, 64847], [64912, 64913], [64968, 65007],
[65020, 65135], [65277, 65295], [65306, 65312], [65339, 65344], [65371, 65381],
[65471, 65473], [65480, 65481], [65488, 65489], [65496, 65497]];
// Expand every inclusive range into individual table entries.
for (i = 0; i < ranges.length; i++) {
start = ranges[i][0];
end = ranges[i][1];
for (j = start; j <= end; j++) {
result[j] = true;
}
}
return result;
})();
/**
 * Split a search query into words.
 *
 * Each UTF-16 code unit flagged in splitChars acts as a separator. The runs
 * of characters between separators are returned in their original order;
 * empty runs are never emitted.
 */
function splitQuery(query) {
    var words = [];
    var length = query.length;
    var wordStart = -1;  // index where the pending word began, -1 between words
    var pos = 0;
    while (pos <= length) {
        // The end of the string closes a pending word just like a separator does.
        var atBoundary = pos === length || Boolean(splitChars[query.charCodeAt(pos)]);
        if (!atBoundary) {
            if (wordStart < 0) {
                wordStart = pos;
            }
        } else if (wordStart >= 0) {
            words.push(query.slice(wordStart, pos));
            wordStart = -1;
        }
        pos += 1;
    }
    return words;
}
"""

View File

@ -47,6 +47,14 @@ var Scorer = {
};
{% endif %}
{% if search_word_splitter_code %}
{{ search_word_splitter_code }}
{% else %}
/**
 * Fallback word splitter, emitted only when the template is rendered
 * without search_word_splitter_code: splits the query on runs of
 * whitespace. The result may contain empty strings when the query starts
 * or ends with whitespace.
 */
function splitQuery(query) {
return query.split(/\s+/);
}
{% endif %}
/**
* Search Module
*/
@ -145,7 +153,7 @@ var Search = {
var searchterms = [];
var excluded = [];
var hlterms = [];
var tmp = query.split(/\W+/);
var tmp = splitQuery(query);
var objectterms = [];
for (i = 0; i < tmp.length; i++) {
if (tmp[i] !== "") {

View File

@ -0,0 +1,142 @@
# -*- coding: utf-8 -*-
import re
import json
import subprocess
import sys
import six
# Collect every code in 0x0000-0xffff that Python's Unicode-aware ``\w`` does
# NOT match; these become the separator codes of the JavaScript splitter.
# The UNICODE flag is passed to re.compile() instead of being written as a
# trailing inline ``(?u)``: a global inline flag that is not at the start of
# the pattern is deprecated since Python 3.6 and rejected by Python 3.11+.
match = re.compile(r'\w', re.UNICODE)

# Standard-library equivalent of six.unichr, so this scan runs on both major
# Python versions without third-party packages.
try:
    _unichr = unichr  # noqa: F821 (Python 2)
except NameError:
    _unichr = chr  # Python 3

begin = -1    # first code of the run being collected, -1 when no run is open
ranges = []   # inclusive (first, last) pairs for runs of two or more codes
singles = []  # codes of runs that are exactly one code long


def _close_run(first, stop):
    """Record the separator run covering ``first`` .. ``stop - 1``."""
    if first + 1 == stop:
        singles.append(first)
    else:
        ranges.append((first, stop - 1))


for i in range(65536):
    # 0xd800-0xdfff is surrogate pair area. skip this.
    if not match.match(_unichr(i)) and not (0xd800 <= i <= 0xdfff):
        if begin == -1:
            begin = i
    elif begin != -1:
        _close_run(begin, i)
        begin = -1
# A run that is still open here extends to 0xffff. Without this flush the
# final run of separator codes would be silently dropped from the table.
if begin != -1:
    _close_run(begin, 65536)
    begin = -1
# fold json within almost 80 chars per line
def fold(jsonData, splitter):
    """Serialize ``jsonData`` to JSON and wrap the text into several lines.

    A line is broken right after the first occurrence of ``splitter`` found
    at or beyond column 70, so ``splitter`` always stays at the end of the
    line it closes. Continuation lines are indented by eight spaces; the
    first line carries no indent because it is substituted after existing
    text (``var singles = %s;``) in the JavaScript template.

    :param jsonData: any value accepted by ``json.dumps``
    :param splitter: text after which a line break is allowed
                     (e.g. ``','`` or ``'],'``)
    :returns: the folded JSON text, lines joined with ``'\\n'``
    """
    # One shared constant for both the indent and the first-line strip, so
    # the two can never get out of step.
    indent = ' ' * 8
    code = json.dumps(jsonData)
    lines = []
    while len(code) >= 71:
        index = code.find(splitter, 70)
        if index == -1:
            # No break point is left in the remainder: keep it on a single
            # (long) line instead of failing with ValueError.
            break
        cut = index + len(splitter)
        lines.append(indent + code[:cut])
        code = code[cut:]
    lines.append(indent + code)
    lines[0] = lines[0][len(indent):]
    return '\n'.join(lines)
# JavaScript code
# Source of the splitter that is embedded both into the generated Python
# module (python_src) and into the node.js regression test (js_test_src).
# The two %s slots receive the folded JSON of `singles` and `ranges`.
js_src = '''
var splitChars = (function() {
var result = {};
var singles = %s;
var i, j, start, end;
for (i = 0; i < singles.length; i++) {
result[singles[i]] = true;
}
var ranges = %s;
for (i = 0; i < ranges.length; i++) {
start = ranges[i][0];
end = ranges[i][1];
for (j = start; j <= end; j++) {
result[j] = true;
}
}
return result;
})();
function splitQuery(query) {
var result = [];
var start = -1;
for (var i = 0; i < query.length; i++) {
if (splitChars[query.charCodeAt(i)]) {
if (start !== -1) {
result.push(query.slice(start, i));
start = -1;
}
} else if (start === -1) {
start = i;
}
}
if (start !== -1) {
result.push(query.slice(start));
}
return result;
}
''' % (fold(singles, ','), fold(ranges, '],'))
# Source of the node.js regression test: the splitter code (js_src) followed
# by assertions on splitQuery(). The header names this generator script, in
# line with the "DO NOT EDIT" banner written into the generated module.
js_test_src = u'''
// This is regression test for https://github.com/sphinx-doc/sphinx/issues/3150
// generated by utils/jssplitter_generator.py
// it needs node.js for testing
var assert = require('assert');
%s
console.log("test splitting English words")
assert.deepEqual(['Hello', 'World'], splitQuery(' Hello World '));
console.log(' ... ok\\n')
console.log("test splitting special characters")
assert.deepEqual(['Pin', 'Code'], splitQuery('Pin-Code'));
console.log(' ... ok\\n')
console.log("test splitting Chinese characters")
assert.deepEqual(['Hello', 'from', '中国', '上海'], splitQuery('Hello from 中国 上海'));
console.log(' ... ok\\n')
console.log("test splitting Emoji(surrogate pair) characters. It should keep emojis.")
assert.deepEqual(['😁😁'], splitQuery('😁😁'));
console.log(' ... ok\\n')
console.log("test splitting umlauts. It should keep umlauts.")
assert.deepEqual(
['Löschen', 'Prüfung', 'Abändern', 'ærlig', 'spørsmål'],
splitQuery('Löschen Prüfung Abändern ærlig spørsmål'));
console.log(' ... ok\\n')
''' % js_src
# Template of the generated sphinx/search/jssplitter.py module; %s receives
# the JavaScript splitter source (js_src).
# The coding declaration is emitted as a real comment on the very first line,
# ahead of the module docstring: Python only honours it as a comment on line
# 1 or 2, never as text inside a string literal.
python_src = '''# -*- coding: utf-8 -*-
"""
sphinx.search.jssplitter
~~~~~~~~~~~~~~~~~~~~~~~~
Provides Python compatible word splitter to JavaScript
:copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
DO NOT EDIT. This is generated by utils/jssplitter_generator.py
"""
splitter_code = """
%s
"""
''' % js_src
# Write the generated module and the regression test, then run the test with
# node.js and exit with its status code.
# NOTE: both paths are relative to the current working directory.
with open('../sphinx/search/jssplitter.py', 'w') as f:
    f.write(python_src)
# The test source is encoded to UTF-8 bytes explicitly, so the file has to be
# opened in binary mode; writing bytes to a text-mode file raises TypeError
# on Python 3.
with open('./regression_test.js', 'wb') as f:
    f.write(js_test_src.encode('utf-8'))
print("starting test...")
result = subprocess.call(['node', './regression_test.js'])
sys.exit(result)