Update splitQuery

2025-02-25 18:55:22 -06:00 · 2022-03-08 01:45:44 +00:00
parent ee298ac47e
commit ac9c2b5571
2 changed files with 66 additions and 90 deletions
--- a/utils/jssplitter_generator.py
+++ b/utils/jssplitter_generator.py
@@ -1,91 +1,71 @@
 import json
-import re
 import subprocess
-import sys

-# find char codes they are matched with Python's (?u)\\w
-
-match = re.compile(r'(?u)\w')
 begin = -1
-
 ranges = []
-singles = []

 for i in range(65536):
-    # 0xd800-0xdfff is surrogate pair area. skip this.
-    if not match.match(chr(i)) and not (0xd800 <= i <= 0xdfff):
+    # Get all non 'word' codepoints. This means skipping all alphanumerics and
+    # '_' (U+005F, decimal 95), matching the `\w` character class in `re`. We
+    # also skip 0xd800-0xdfff, the surrogate pair area.
+    if not (chr(i).isalnum() or i == 95) and not (0xd800 <= i <= 0xdfff):
        if begin == -1:
            begin = i
    elif begin != -1:
-        if begin + 1 == i:
-            singles.append(begin)
-        else:
-            ranges.append((begin, i - 1))
+        ranges.append((begin, i))
        begin = -1


 # fold the JSON text into lines of roughly 80 characters each
-def fold(jsonData, splitter):
-    code = json.dumps(jsonData)
+def fold(json_data, splitter):
+    code = json.dumps(json_data)
    lines = []
    while True:
-        if len(code) < 71:
-            lines.append('        ' + code)
+        if len(code) < 75:
+            lines.append('    ' + code)
            break
-        index = code.index(splitter, 70)
-        lines.append('        ' + code[:index + len(splitter)])
+        index = code.index(splitter, 74)
+        lines.append('    ' + code[:index + len(splitter)])
        code = code[index + len(splitter):]
-    lines[0] = lines[0][8:]
+    lines[0] = lines[0][4:]
    return '\n'.join(lines)


 # JavaScript code
-js_src = '''
-var splitChars = (function() {
-    var result = {};
-    var singles = %s;
-    var i, j, start, end;
-    for (i = 0; i < singles.length; i++) {
-        result[singles[i]] = true;
-    }
-    var ranges = %s;
-    for (i = 0; i < ranges.length; i++) {
-        start = ranges[i][0];
-        end = ranges[i][1];
-        for (j = start; j <= end; j++) {
-            result[j] = true;
-        }
-    }
-    return result;
-})();
+js_src = '''\
+const splitChars = new Set(
+    ''' + fold(ranges, "],") + '''.map(
+        ([start, end]) => Array(end - start).fill(0).map((_, i) => start + i)
+    ).flat()
+)

-function splitQuery(query) {
-    var result = [];
-    var start = -1;
-    for (var i = 0; i < query.length; i++) {
-        if (splitChars[query.charCodeAt(i)]) {
-            if (start !== -1) {
+const splitQuery = (query) => {
+    const result = [];
+    let start = null;
+    for (let i = 0; i < query.length; i++) {
+        if (splitChars.has(query.charCodeAt(i))) {
+            if (start !== null) {
                result.push(query.slice(start, i));
-                start = -1;
+                start = null;
+            }
+        } else {
+            if (start === null) start = i;
+            if (i === query.length - 1) {
+                result.push(query.slice(start));
            }
-        } else if (start === -1) {
-            start = i;
        }
    }
-    if (start !== -1) {
-        result.push(query.slice(start));
-    }
    return result;
 }
-''' % (fold(singles, ','), fold(ranges, '],'))
+'''

-js_test_src = '''
+js_test_src = f'''\
 // This is regression test for https://github.com/sphinx-doc/sphinx/issues/3150
 // generated by compat_regexp_generator.py
 // it needs node.js for testing
-var assert = require('assert');
+const assert = require('assert');

-%s
+{js_src}

 console.log("test splitting English words")
 assert.deepEqual(['Hello', 'World'], splitQuery('   Hello    World   '));
@@ -99,7 +79,7 @@ console.log("test splitting Chinese characters")
 assert.deepEqual(['Hello', 'from', '中国', '上海'], splitQuery('Hello from 中国 上海'));
 console.log('   ... ok\\n')

-console.log("test splitting Emoji(surrogate pair) characters. It should keep emojis.")
+console.log("test splitting Emoji (surrogate pair) characters. It should keep emojis.")
 assert.deepEqual(['😁😁'], splitQuery('😁😁'));
 console.log('   ... ok\\n')

@@ -109,9 +89,9 @@ assert.deepEqual(
    splitQuery('Löschen Prüfung Abändern ærlig spørsmål'));
 console.log('   ... ok\\n')

-''' % js_src
+'''

-python_src = '''\
+python_src = f'''\
 """
    sphinx.search.jssplitter
    ~~~~~~~~~~~~~~~~~~~~~~~~
@@ -120,21 +100,20 @@ python_src = '''\

    DO NOT EDIT. This is generated by utils/jssplitter_generator.py

-    :copyright: Copyright 2007-2019 by the Sphinx team, see AUTHORS.
+    :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
 """

 splitter_code = """
-%s
+{js_src}
 """
-''' % js_src
+'''

-with open('../sphinx/search/jssplitter.py', 'w') as f:
+with open('../sphinx/search/jssplitter.py', 'w', encoding="utf-8") as f:
    f.write(python_src)

-with open('./regression_test.js', 'w') as f:
+with open('./regression_test.js', 'w', encoding="utf-8") as f:
    f.write(js_test_src)

 print("starting test...")
-result = subprocess.call(['node', './regression_test.js'])
-sys.exit(result)
+raise SystemExit(subprocess.call(['node', './regression_test.js']))