sphinx/utils/jssplitter_generator.py

143 lines
3.6 KiB
Python
Raw Normal View History

2016-11-15 20:59:04 -06:00
# -*- coding: utf-8 -*-
import re
import json
import subprocess
import sys
import six
# Collect every BMP code point (0x0000-0xFFFF) that Python's Unicode-aware
# ``\w`` does NOT match.  These are the characters the generated JavaScript
# treats as word separators.  An isolated code point is stored in ``singles``;
# a run of two or more consecutive code points is stored in ``ranges`` as an
# inclusive ``(first, last)`` pair.
try:
    _unichr = unichr  # Python 2: chr() only covers 0-255
except NameError:
    _unichr = chr  # Python 3: chr() already returns text

# The flag is passed explicitly.  A global inline flag that is not at the
# start of the pattern (``\w(?u)``) is deprecated since Python 3.6 and is an
# error from Python 3.11 on.
match = re.compile(r'\w', re.UNICODE)
begin = -1  # first code point of the run being collected, -1 when idle
ranges = []
singles = []
# Iterate one past the last code point: the extra step acts as a sentinel
# that closes a run reaching 0xFFFF, which would otherwise be dropped.
for i in range(0x10000 + 1):
    is_separator = (
        i < 0x10000
        # 0xd800-0xdfff is the surrogate pair area.  It is never a separator,
        # so astral characters (e.g. emoji) stay inside words.
        and not (0xd800 <= i <= 0xdfff)
        and not match.match(_unichr(i))
    )
    if is_separator:
        if begin == -1:
            begin = i
    elif begin != -1:
        if begin + 1 == i:
            singles.append(begin)
        else:
            ranges.append((begin, i - 1))
        begin = -1
# fold JSON so that each line is roughly 80 characters wide
def fold(jsonData, splitter):
    """Serialize *jsonData* to JSON and wrap it over several lines.

    The text is cut after the first occurrence of *splitter* found at or
    beyond offset 70 of the remaining text, so a line break always falls
    right after a separator.  Continuation lines are indented by eight
    spaces; the first line carries no indent because it is substituted
    directly after ``var name = `` in the JavaScript template.

    :param jsonData: any value accepted by ``json.dumps``.
    :param splitter: separator text after which a line may be broken,
        e.g. ``','`` or ``'],'``.
    :returns: the folded JSON text, lines joined with ``'\\n'``.
    """
    # One constant drives both the prefix added to every line and the amount
    # stripped from the first line, so the two can never disagree.
    indent = ' ' * 8
    code = json.dumps(jsonData)
    lines = []
    while len(code) >= 71:
        index = code.find(splitter, 70)
        if index == -1:
            # No further break point: emit the remainder as one (long) line
            # instead of failing with ValueError.
            break
        cut = index + len(splitter)
        lines.append(indent + code[:cut])
        code = code[cut:]
    lines.append(indent + code)
    lines[0] = lines[0][len(indent):]
    return '\n'.join(lines)
# JavaScript source of the splitter.
# The template has two ``%s`` slots: the first receives the folded JSON array
# of single separator code points, the second the folded JSON array of
# [start, end] separator ranges.  ``splitChars`` becomes a lookup table keyed
# by char code, and ``splitQuery`` cuts a query at every char code found in it.
_folded_singles = fold(singles, ',')
_folded_ranges = fold(ranges, '],')
js_src = '''
var splitChars = (function() {
var result = {};
var singles = %s;
var i, j, start, end;
for (i = 0; i < singles.length; i++) {
result[singles[i]] = true;
}
var ranges = %s;
for (i = 0; i < ranges.length; i++) {
start = ranges[i][0];
end = ranges[i][1];
for (j = start; j <= end; j++) {
result[j] = true;
}
}
return result;
})();
function splitQuery(query) {
var result = [];
var start = -1;
for (var i = 0; i < query.length; i++) {
if (splitChars[query.charCodeAt(i)]) {
if (start !== -1) {
result.push(query.slice(start, i));
start = -1;
}
} else if (start === -1) {
start = i;
}
}
if (start !== -1) {
result.push(query.slice(start));
}
return result;
}
''' % (_folded_singles, _folded_ranges)
# Node.js regression test: the splitter source (``js_src``) is substituted
# into the ``%s`` slot, followed by assertions on ``splitQuery``.
# The emitted header names this generator (jssplitter_generator.py), matching
# the generator name written into the generated Python module, and every
# assertion uses Node's documented ``(actual, expected)`` argument order so
# that failure reports label the two values correctly.
js_test_src = u'''
// This is regression test for https://github.com/sphinx-doc/sphinx/issues/3150
// generated by jssplitter_generator.py
// it needs node.js for testing
var assert = require('assert');
%s
console.log("test splitting English words")
assert.deepEqual(splitQuery(' Hello World '), ['Hello', 'World']);
console.log(' ... ok\\n')
console.log("test splitting special characters")
assert.deepEqual(splitQuery('Pin-Code'), ['Pin', 'Code']);
console.log(' ... ok\\n')
console.log("test splitting Chinese characters")
assert.deepEqual(splitQuery('Hello from 中国 上海'), ['Hello', 'from', '中国', '上海']);
console.log(' ... ok\\n')
console.log("test splitting Emoji(surrogate pair) characters. It should keep emojis.")
assert.deepEqual(splitQuery('😁😁'), ['😁😁']);
console.log(' ... ok\\n')
console.log("test splitting umlauts. It should keep umlauts.")
assert.deepEqual(
splitQuery('Löschen Prüfung Abändern ærlig spørsmål'),
['Löschen', 'Prüfung', 'Abändern', 'ærlig', 'spørsmål']);
console.log(' ... ok\\n')
''' % js_src
# Source text of the generated Python module.  The JavaScript splitter
# (``js_src``) is substituted into the ``%s`` slot and exposed by the generated
# module as the string ``splitter_code``.
# The encoding declaration is emitted as a real comment on the first line of
# the generated file: PEP 263 only recognises it in a comment on line 1 or 2,
# so placing it inside the module docstring has no effect.
python_src = '''# -*- coding: utf-8 -*-
"""
sphinx.search.jssplitter
~~~~~~~~~~~~~~~~~~~~~~~~
Provides Python compatible word splitter to JavaScript
:copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
DO NOT EDIT. This is generated by utils/jssplitter_generator.py
"""
splitter_code = """
%s
"""
''' % js_src
# Write the generated Python module.  Both output paths are relative to the
# current working directory.
with open('../sphinx/search/jssplitter.py', 'w') as f:
    f.write(python_src)
# The test source is encoded to UTF-8 bytes explicitly, so the file has to be
# opened in binary mode: writing bytes to a text-mode file raises TypeError on
# Python 3.  Binary mode works on both Python 2 and Python 3.
with open('./regression_test.js', 'wb') as f:
    f.write(js_test_src.encode('utf-8'))
print("starting test...")
# Run the regression test under node and use its return code as this
# script's own exit status.
result = subprocess.call(['node', './regression_test.js'])
sys.exit(result)