Merge pull request #12141 from vigoux/treesitter_fix

[RFC] Update treesitter runtime
Get rid of our utf8proc dependency

Note that we unconditionally escape treesitter queries; this might need to be revisited.
This commit is contained in:
Matthieu Coudron 2020-04-18 16:05:16 +02:00 committed by GitHub
commit 9ac5bc4b0b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 304 additions and 263 deletions

View File

@ -374,13 +374,6 @@ include_directories(SYSTEM ${MSGPACK_INCLUDE_DIRS})
find_package(LibLUV 1.30.0 REQUIRED)
include_directories(SYSTEM ${LIBLUV_INCLUDE_DIRS})
find_package(UTF8PROC REQUIRED)
include_directories(SYSTEM ${UTF8PROC_INCLUDE_DIRS})
if(WIN32)
add_definitions(-DUTF8PROC_STATIC)
endif()
# Note: The test lib requires LuaJIT; it will be skipped if LuaJIT is missing.
option(PREFER_LUA "Prefer Lua over LuaJIT in the nvim executable." OFF)

View File

@ -1,16 +0,0 @@
# - Try to find utf8proc
# Once done this will define
# UTF8PROC_FOUND - System has utf8proc
# UTF8PROC_INCLUDE_DIRS - The utf8proc include directories
# UTF8PROC_LIBRARIES - The libraries needed to use utf8proc
include(LibFindMacros)
# Candidate library names to search for, in order of preference.
set(UTF8PROC_NAMES utf8proc)
if(MSVC)
# "utf8proc_static" is used for MSVC (when built statically from third-party).
# https://github.com/JuliaStrings/utf8proc/commit/0975bf9b6.
list(APPEND UTF8PROC_NAMES utf8proc_static)
endif()
# Locate the header and library, consulting pkg-config hints when available.
libfind_pkg_detect(UTF8PROC utf8proc FIND_PATH utf8proc.h FIND_LIBRARY ${UTF8PROC_NAMES})
# Validate the results and populate UTF8PROC_FOUND/_INCLUDE_DIRS/_LIBRARIES;
# REQUIRED makes configuration fail if utf8proc is missing.
libfind_process(UTF8PROC REQUIRED)

View File

@ -124,7 +124,7 @@ end
function M.parse_query(lang, query)
M.require_language(lang)
local self = setmetatable({}, Query)
self.query = vim._ts_parse_query(lang, query)
self.query = vim._ts_parse_query(lang, vim.fn.escape(query,'\\'))
self.info = self.query:inspect()
self.captures = self.info.captures
self.regexes = {}

View File

@ -72,8 +72,10 @@ const char *ts_language_symbol_name(
return "ERROR";
} else if (symbol == ts_builtin_sym_error_repeat) {
return "_ERROR";
} else {
} else if (symbol < ts_language_symbol_count(self)) {
return self->symbol_names[symbol];
} else {
return NULL;
}
}
@ -119,7 +121,7 @@ const char *ts_language_field_name_for_id(
TSFieldId id
) {
uint32_t count = ts_language_field_count(self);
if (count) {
if (count && id <= count) {
return self->field_names[id];
} else {
return NULL;

View File

@ -150,7 +150,9 @@ static inline TSNode ts_node__child(
while (ts_node_child_iterator_next(&iterator, &child)) {
if (ts_node__is_relevant(child, include_anonymous)) {
if (index == child_index) {
ts_tree_set_cached_parent(self.tree, &child, &self);
if (ts_node__is_relevant(self, true)) {
ts_tree_set_cached_parent(self.tree, &child, &self);
}
return child;
}
index++;

View File

@ -324,6 +324,12 @@ static bool ts_parser__can_reuse_first_leaf(
TSStateId leaf_state = ts_subtree_leaf_parse_state(tree);
TSLexMode leaf_lex_mode = self->language->lex_modes[leaf_state];
// At the end of a non-terminal extra node, the lexer normally returns
// NULL, which indicates that the parser should look for a reduce action
// at symbol `0`. Avoid reusing tokens in this situation to ensure that
// the same thing happens when incrementally reparsing.
if (current_lex_mode.lex_state == (uint16_t)(-1)) return false;
// If the token was created in a state with the same set of lookaheads, it is reusable.
if (
table_entry->action_count > 0 &&
@ -593,6 +599,10 @@ static Subtree ts_parser__reuse_node(
uint32_t byte_offset = reusable_node_byte_offset(&self->reusable_node);
uint32_t end_byte_offset = byte_offset + ts_subtree_total_bytes(result);
// Do not reuse an EOF node if the included ranges array has changes
// later on in the file.
if (ts_subtree_is_eof(result)) end_byte_offset = UINT32_MAX;
if (byte_offset > position) {
LOG("before_reusable_node symbol:%s", TREE_NAME(result));
break;
@ -1605,8 +1615,8 @@ static unsigned ts_parser__condense_stack(TSParser *self) {
static bool ts_parser_has_outstanding_parse(TSParser *self) {
return (
self->lexer.current_position.bytes > 0 ||
ts_stack_state(self->stack, 0) != 1
ts_stack_state(self->stack, 0) != 1 ||
ts_stack_node_count_since_error(self->stack, 0) != 0
);
}

View File

@ -35,15 +35,21 @@ typedef struct {
* captured in this pattern.
* - `depth` - The depth where this node occurs in the pattern. The root node
* of the pattern has depth zero.
* - `repeat_step_index` - If this step is part of a repetition, the index of
* the beginning of the repetition. A `NONE` value means this step is not
* part of a repetition.
*/
typedef struct {
TSSymbol symbol;
TSFieldId field;
uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT];
uint16_t depth: 13;
uint16_t repeat_step_index;
uint16_t depth: 11;
bool contains_captures: 1;
bool is_pattern_start: 1;
bool is_immediate: 1;
bool is_last: 1;
bool is_repeated: 1;
} QueryStep;
/*
@ -85,23 +91,27 @@ typedef struct {
* represented as one of these states.
*/
typedef struct {
uint32_t id;
uint16_t start_depth;
uint16_t pattern_index;
uint16_t step_index;
uint16_t capture_count;
uint16_t capture_list_id;
uint16_t consumed_capture_count;
uint32_t id;
uint16_t repeat_match_count;
uint16_t step_index_on_failure;
uint8_t capture_list_id;
bool seeking_non_match;
} QueryState;
typedef Array(TSQueryCapture) CaptureList;
/*
* CaptureListPool - A collection of *lists* of captures. Each QueryState
* needs to maintain its own list of captures. They are all represented as
* slices of one shared array. The CaptureListPool keeps track of which
* parts of the shared array are currently in use by a QueryState.
* needs to maintain its own list of captures. To avoid repeated allocations,
* the reuses a fixed set of capture lists, and keeps track of which ones
* are currently in use.
*/
typedef struct {
Array(TSQueryCapture) list;
CaptureList list[32];
uint32_t usage_map;
} CaptureListPool;
@ -119,7 +129,6 @@ struct TSQuery {
Array(Slice) predicates_by_pattern;
Array(uint32_t) start_bytes_by_pattern;
const TSLanguage *language;
uint16_t max_capture_count;
uint16_t wildcard_root_pattern_count;
TSSymbol *symbol_map;
};
@ -233,24 +242,25 @@ static void stream_scan_identifier(Stream *stream) {
static CaptureListPool capture_list_pool_new(void) {
return (CaptureListPool) {
.list = array_new(),
.usage_map = UINT32_MAX,
};
}
static void capture_list_pool_reset(CaptureListPool *self, uint16_t list_size) {
static void capture_list_pool_reset(CaptureListPool *self) {
self->usage_map = UINT32_MAX;
uint32_t total_size = MAX_STATE_COUNT * list_size;
array_reserve(&self->list, total_size);
self->list.size = total_size;
for (unsigned i = 0; i < 32; i++) {
array_clear(&self->list[i]);
}
}
static void capture_list_pool_delete(CaptureListPool *self) {
array_delete(&self->list);
for (unsigned i = 0; i < 32; i++) {
array_delete(&self->list[i]);
}
}
static TSQueryCapture *capture_list_pool_get(CaptureListPool *self, uint16_t id) {
return &self->list.contents[id * (self->list.size / MAX_STATE_COUNT)];
static CaptureList *capture_list_pool_get(CaptureListPool *self, uint16_t id) {
return &self->list[id];
}
static bool capture_list_pool_is_empty(const CaptureListPool *self) {
@ -269,6 +279,7 @@ static uint16_t capture_list_pool_acquire(CaptureListPool *self) {
}
static void capture_list_pool_release(CaptureListPool *self, uint16_t id) {
array_clear(&self->list[id]);
self->usage_map |= bitmask_for_index(id);
}
@ -331,6 +342,67 @@ static uint16_t symbol_table_insert_name(
return self->slices.size - 1;
}
static uint16_t symbol_table_insert_name_with_escapes(
SymbolTable *self,
const char *escaped_name,
uint32_t escaped_length
) {
Slice slice = {
.offset = self->characters.size,
.length = 0,
};
array_grow_by(&self->characters, escaped_length + 1);
// Copy the contents of the literal into the characters buffer, processing escape
// sequences like \n and \". This needs to be done before checking if the literal
// is already present, in order to do the string comparison.
bool is_escaped = false;
for (unsigned i = 0; i < escaped_length; i++) {
const char *src = &escaped_name[i];
char *dest = &self->characters.contents[slice.offset + slice.length];
if (is_escaped) {
switch (*src) {
case 'n':
*dest = '\n';
break;
case 'r':
*dest = '\r';
break;
case 't':
*dest = '\t';
break;
case '0':
*dest = '\0';
break;
default:
*dest = *src;
break;
}
is_escaped = false;
slice.length++;
} else {
if (*src == '\\') {
is_escaped = true;
} else {
*dest = *src;
slice.length++;
}
}
}
// If the string is already present, remove the redundant content from the characters
// buffer and return the existing id.
int id = symbol_table_id_for_name(self, &self->characters.contents[slice.offset], slice.length);
if (id >= 0) {
self->characters.size -= (escaped_length + 1);
return id;
}
self->characters.contents[slice.offset + slice.length] = 0;
array_push(&self->slices, slice);
return self->slices.size - 1;
}
/************
* QueryStep
************/
@ -346,7 +418,11 @@ static QueryStep query_step__new(
.field = 0,
.capture_ids = {NONE, NONE, NONE, NONE},
.contains_captures = false,
.is_repeated = false,
.is_last = false,
.is_pattern_start = false,
.is_immediate = is_immediate,
.repeat_step_index = NONE,
};
}
@ -523,9 +599,22 @@ static TSQueryError ts_query__parse_predicate(
stream_advance(stream);
// Parse the string content
bool is_escaped = false;
const char *string_content = stream->input;
while (stream->next != '"') {
if (stream->next == '\n' || !stream_advance(stream)) {
for (;;) {
if (is_escaped) {
is_escaped = false;
} else {
if (stream->next == '\\') {
is_escaped = true;
} else if (stream->next == '"') {
break;
} else if (stream->next == '\n') {
stream_reset(stream, string_content - 1);
return TSQueryErrorSyntax;
}
}
if (!stream_advance(stream)) {
stream_reset(stream, string_content - 1);
return TSQueryErrorSyntax;
}
@ -533,7 +622,7 @@ static TSQueryError ts_query__parse_predicate(
uint32_t length = stream->input - string_content;
// Add a step for the node
uint16_t id = symbol_table_insert_name(
uint16_t id = symbol_table_insert_name_with_escapes(
&self->predicate_values,
string_content,
length
@ -624,7 +713,7 @@ static TSQueryError ts_query__parse_pattern(
// Parse the wildcard symbol
if (stream->next == '*') {
symbol = NAMED_WILDCARD_SYMBOL;
symbol = depth > 0 ? NAMED_WILDCARD_SYMBOL : WILDCARD_SYMBOL;
stream_advance(stream);
}
@ -768,27 +857,43 @@ static TSQueryError ts_query__parse_pattern(
stream_skip_whitespace(stream);
// Parse an '@'-prefixed capture pattern
while (stream->next == '@') {
stream_advance(stream);
// Parse the capture name
if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
const char *capture_name = stream->input;
stream_scan_identifier(stream);
uint32_t length = stream->input - capture_name;
// Add the capture id to the first step of the pattern
uint16_t capture_id = symbol_table_insert_name(
&self->captures,
capture_name,
length
);
// Parse suffixes modifiers for this pattern
for (;;) {
QueryStep *step = &self->steps.contents[starting_step_index];
query_step__add_capture(step, capture_id);
(*capture_count)++;
stream_skip_whitespace(stream);
if (stream->next == '+') {
stream_advance(stream);
step->is_repeated = true;
array_back(&self->steps)->repeat_step_index = starting_step_index;
stream_skip_whitespace(stream);
}
// Parse an '@'-prefixed capture pattern
else if (stream->next == '@') {
stream_advance(stream);
// Parse the capture name
if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
const char *capture_name = stream->input;
stream_scan_identifier(stream);
uint32_t length = stream->input - capture_name;
// Add the capture id to the first step of the pattern
uint16_t capture_id = symbol_table_insert_name(
&self->captures,
capture_name,
length
);
query_step__add_capture(step, capture_id);
(*capture_count)++;
stream_skip_whitespace(stream);
}
// No more suffix modifiers
else {
break;
}
}
return 0;
@ -838,16 +943,14 @@ TSQuery *ts_query_new(
.predicates_by_pattern = array_new(),
.symbol_map = symbol_map,
.wildcard_root_pattern_count = 0,
.max_capture_count = 0,
.language = language,
};
// Parse all of the S-expressions in the given string.
Stream stream = stream_new(source, source_len);
stream_skip_whitespace(&stream);
uint32_t start_step_index;
while (stream.input < stream.end) {
start_step_index = self->steps.size;
uint32_t start_step_index = self->steps.size;
uint32_t capture_count = 0;
array_push(&self->start_bytes_by_pattern, stream.input - source);
array_push(&self->predicates_by_pattern, ((Slice) {
@ -865,7 +968,19 @@ TSQuery *ts_query_new(
return NULL;
}
// If a pattern has a wildcard at its root, optimize the matching process
// by skipping matching the wildcard.
if (
self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL
) {
QueryStep *second_step = &self->steps.contents[start_step_index + 1];
if (second_step->symbol != WILDCARD_SYMBOL && second_step->depth != PATTERN_DONE_MARKER) {
start_step_index += 1;
}
}
// Maintain a map that can look up patterns for a given root symbol.
self->steps.contents[start_step_index].is_pattern_start = true;
ts_query__pattern_map_insert(
self,
self->steps.contents[start_step_index].symbol,
@ -874,13 +989,6 @@ TSQuery *ts_query_new(
if (self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL) {
self->wildcard_root_pattern_count++;
}
// Keep track of the maximum number of captures in pattern, because
// that numer determines how much space is needed to store each capture
// list.
if (capture_count > self->max_capture_count) {
self->max_capture_count = capture_count;
}
}
ts_query__finalize_steps(self);
@ -1015,7 +1123,7 @@ void ts_query_cursor_exec(
array_clear(&self->states);
array_clear(&self->finished_states);
ts_tree_cursor_reset(&self->cursor, node);
capture_list_pool_reset(&self->capture_list_pool, query->max_capture_count);
capture_list_pool_reset(&self->capture_list_pool);
self->next_state_id = 0;
self->depth = 0;
self->ascending = false;
@ -1059,12 +1167,12 @@ static bool ts_query_cursor__first_in_progress_capture(
bool result = false;
for (unsigned i = 0; i < self->states.size; i++) {
const QueryState *state = &self->states.contents[i];
if (state->capture_count > 0) {
const TSQueryCapture *captures = capture_list_pool_get(
&self->capture_list_pool,
state->capture_list_id
);
uint32_t capture_byte = ts_node_start_byte(captures[0].node);
const CaptureList *captures = capture_list_pool_get(
&self->capture_list_pool,
state->capture_list_id
);
if (captures->size > 0) {
uint32_t capture_byte = ts_node_start_byte(captures->contents[0].node);
if (
!result ||
capture_byte < *byte_offset ||
@ -1087,6 +1195,19 @@ static bool ts_query__cursor_add_state(
TSQueryCursor *self,
const PatternEntry *pattern
) {
QueryStep *step = &self->query->steps.contents[pattern->step_index];
// If this pattern begins with a repetition, then avoid creating
// new states after already matching the repetition one or more times.
// The query should only one match for the repetition - the one that
// started the earliest.
if (step->is_repeated) {
for (unsigned i = 0; i < self->states.size; i++) {
QueryState *state = &self->states.contents[i];
if (state->step_index == pattern->step_index) return true;
}
}
uint32_t list_id = capture_list_pool_acquire(&self->capture_list_pool);
// If there are no capture lists left in the pool, then terminate whichever
@ -1112,14 +1233,20 @@ static bool ts_query__cursor_add_state(
}
}
LOG(" start state. pattern:%u\n", pattern->pattern_index);
LOG(
" start state. pattern:%u, step:%u\n",
pattern->pattern_index,
pattern->step_index
);
array_push(&self->states, ((QueryState) {
.capture_list_id = list_id,
.step_index = pattern->step_index,
.pattern_index = pattern->pattern_index,
.start_depth = self->depth,
.capture_count = 0,
.start_depth = self->depth - step->depth,
.consumed_capture_count = 0,
.repeat_match_count = 0,
.step_index_on_failure = NONE,
.seeking_non_match = false,
}));
return true;
}
@ -1133,15 +1260,15 @@ static QueryState *ts_query__cursor_copy_state(
array_push(&self->states, *state);
QueryState *new_state = array_back(&self->states);
new_state->capture_list_id = new_list_id;
TSQueryCapture *old_captures = capture_list_pool_get(
CaptureList *old_captures = capture_list_pool_get(
&self->capture_list_pool,
state->capture_list_id
);
TSQueryCapture *new_captures = capture_list_pool_get(
CaptureList *new_captures = capture_list_pool_get(
&self->capture_list_pool,
new_list_id
);
memcpy(new_captures, old_captures, state->capture_count * sizeof(TSQueryCapture));
array_push_all(new_captures, old_captures);
return new_state;
}
@ -1298,6 +1425,24 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
}
if (!node_does_match) {
// If this QueryState has processed a repeating sequence, and that repeating
// sequence has ended, move on to the *next* step of this state's pattern.
if (
state->step_index_on_failure != NONE &&
(!later_sibling_can_match || step->is_repeated)
) {
LOG(
" finish repetition state. pattern:%u, step:%u\n",
state->pattern_index,
state->step_index
);
state->step_index = state->step_index_on_failure;
state->step_index_on_failure = NONE;
state->repeat_match_count = 0;
i--;
continue;
}
if (!later_sibling_can_match) {
LOG(
" discard state. pattern:%u, step:%u\n",
@ -1312,9 +1457,17 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
i--;
n--;
}
state->seeking_non_match = false;
continue;
}
// The `seeking_non_match` flag indicates that a previous QueryState
// has already begun processing this repeating sequence, so that *this*
// QueryState should not begin matching until a separate repeating sequence
// is found.
if (state->seeking_non_match) continue;
// Some patterns can match their root node in multiple ways,
// capturing different children. If this pattern step could match
// later children within the same parent, then this query state
@ -1324,11 +1477,20 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
// siblings.
QueryState *next_state = state;
if (
step->depth > 0 &&
!step->is_pattern_start &&
step->contains_captures &&
later_sibling_can_match
later_sibling_can_match &&
state->repeat_match_count == 0
) {
QueryState *copy = ts_query__cursor_copy_state(self, state);
// The QueryState that matched this node has begun matching a repeating
// sequence. The QueryState that *skipped* this node should not start
// matching later elements of the same repeating sequence.
if (step->is_repeated) {
state->seeking_non_match = true;
}
if (copy) {
LOG(
" split state. pattern:%u, step:%u\n",
@ -1337,55 +1499,71 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
);
next_state = copy;
} else {
LOG(" canot split state.\n");
LOG(" cannot split state.\n");
}
}
LOG(
" advance state. pattern:%u, step:%u\n",
next_state->pattern_index,
next_state->step_index
);
// If the current node is captured in this pattern, add it to the
// capture list.
for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) {
uint16_t capture_id = step->capture_ids[j];
if (step->capture_ids[j] == NONE) break;
LOG(
" capture node. pattern:%u, capture_id:%u\n",
next_state->pattern_index,
capture_id
);
TSQueryCapture *capture_list = capture_list_pool_get(
CaptureList *capture_list = capture_list_pool_get(
&self->capture_list_pool,
next_state->capture_list_id
);
capture_list[next_state->capture_count++] = (TSQueryCapture) {
array_push(capture_list, ((TSQueryCapture) {
node,
capture_id
};
}));
LOG(
" capture node. pattern:%u, capture_id:%u, capture_count:%u\n",
next_state->pattern_index,
capture_id,
capture_list->size
);
}
// If the pattern is now done, then remove it from the list of
// in-progress states, and add it to the list of finished states.
next_state->step_index++;
QueryStep *next_step = step + 1;
if (next_step->depth == PATTERN_DONE_MARKER) {
LOG(" finish pattern %u\n", next_state->pattern_index);
// If this is the end of a repetition, then jump back to the beginning
// of that repetition.
if (step->repeat_step_index != NONE) {
next_state->step_index_on_failure = next_state->step_index + 1;
next_state->step_index = step->repeat_step_index;
next_state->repeat_match_count++;
LOG(
" continue repeat. pattern:%u, match_count:%u\n",
next_state->pattern_index,
next_state->repeat_match_count
);
} else {
next_state->step_index++;
LOG(
" advance state. pattern:%u, step:%u\n",
next_state->pattern_index,
next_state->step_index
);
next_state->id = self->next_state_id++;
array_push(&self->finished_states, *next_state);
if (next_state == state) {
array_erase(&self->states, i);
i--;
n--;
} else {
self->states.size--;
QueryStep *next_step = step + 1;
// If the pattern is now done, then remove it from the list of
// in-progress states, and add it to the list of finished states.
if (next_step->depth == PATTERN_DONE_MARKER) {
LOG(" finish pattern %u\n", next_state->pattern_index);
next_state->id = self->next_state_id++;
array_push(&self->finished_states, *next_state);
if (next_state == state) {
array_erase(&self->states, i);
i--;
n--;
} else {
self->states.size--;
}
}
}
}
// Continue descending if possible.
if (ts_tree_cursor_goto_first_child(&self->cursor)) {
self->depth++;
@ -1411,11 +1589,12 @@ bool ts_query_cursor_next_match(
QueryState *state = &self->finished_states.contents[0];
match->id = state->id;
match->pattern_index = state->pattern_index;
match->capture_count = state->capture_count;
match->captures = capture_list_pool_get(
CaptureList *captures = capture_list_pool_get(
&self->capture_list_pool,
state->capture_list_id
);
match->captures = captures->contents;
match->capture_count = captures->size;
capture_list_pool_release(&self->capture_list_pool, state->capture_list_id);
array_erase(&self->finished_states, 0);
return true;
@ -1468,13 +1647,13 @@ bool ts_query_cursor_next_capture(
uint32_t first_finished_pattern_index = first_unfinished_pattern_index;
for (unsigned i = 0; i < self->finished_states.size; i++) {
const QueryState *state = &self->finished_states.contents[i];
if (state->capture_count > state->consumed_capture_count) {
const TSQueryCapture *captures = capture_list_pool_get(
&self->capture_list_pool,
state->capture_list_id
);
CaptureList *captures = capture_list_pool_get(
&self->capture_list_pool,
state->capture_list_id
);
if (captures->size > state->consumed_capture_count) {
uint32_t capture_byte = ts_node_start_byte(
captures[state->consumed_capture_count].node
captures->contents[state->consumed_capture_count].node
);
if (
capture_byte < first_finished_capture_byte ||
@ -1506,11 +1685,12 @@ bool ts_query_cursor_next_capture(
];
match->id = state->id;
match->pattern_index = state->pattern_index;
match->capture_count = state->capture_count;
match->captures = capture_list_pool_get(
CaptureList *captures = capture_list_pool_get(
&self->capture_list_pool,
state->capture_list_id
);
match->captures = captures->contents;
match->capture_count = captures->size;
*capture_index = state->consumed_capture_count;
state->consumed_capture_count++;
return true;

View File

@ -1,33 +0,0 @@
#include "./utf16.h"
// Decode a single code point from a UTF-16 string (native byte order, as the
// buffer is reinterpreted as uint16_t units below).
// Stores the decoded code point in `*code_point`, or -1 when the input is
// truncated or malformed, and returns the number of BYTES consumed.
utf8proc_ssize_t utf16_iterate(
const utf8proc_uint8_t *string,
utf8proc_ssize_t length,
utf8proc_int32_t *code_point
) {
// Fewer than 2 bytes: not even one UTF-16 code unit available.
if (length < 2) {
*code_point = -1;
return 0;
}
uint16_t *units = (uint16_t *)string;
uint16_t unit = units[0];
// Units outside the surrogate range [0xD800, 0xE000) encode the code point
// directly in a single 16-bit unit.
if (unit < 0xd800 || unit >= 0xe000) {
*code_point = unit;
return 2;
}
// High (lead) surrogate: must be followed by a low (trail) surrogate in
// [0xDC00, 0xE000) to form a supplementary-plane code point.
if (unit < 0xdc00) {
if (length >= 4) {
uint16_t next_unit = units[1];
if (next_unit >= 0xdc00 && next_unit < 0xe000) {
// Combine the surrogate pair: 10 bits from each unit, offset by 0x10000.
*code_point = 0x10000 + ((unit - 0xd800) << 10) + (next_unit - 0xdc00);
return 4;
}
}
}
// Unpaired surrogate (or truncated pair): report -1 but still consume one
// unit so iteration can make progress.
*code_point = -1;
return 2;
}

View File

@ -1,21 +0,0 @@
#ifndef TREE_SITTER_UTF16_H_
#define TREE_SITTER_UTF16_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include <stdlib.h>
// Provides the utf8proc_ssize_t / utf8proc_uint8_t / utf8proc_int32_t types
// used in the declaration below.
#include "utf8proc.h"
// Analogous to utf8proc's utf8proc_iterate function. Reads one code point from
// the given UTF16 string and stores it in the location pointed to by `code_point`.
// Returns the number of bytes in `string` that were read.
utf8proc_ssize_t utf16_iterate(const utf8proc_uint8_t *, utf8proc_ssize_t, utf8proc_int32_t *);
#ifdef __cplusplus
}
#endif
#endif  // TREE_SITTER_UTF16_H_

View File

@ -35,7 +35,6 @@ option(USE_BUNDLED_LIBTERMKEY "Use the bundled libtermkey." ${USE_BUNDLED})
option(USE_BUNDLED_LIBVTERM "Use the bundled libvterm." ${USE_BUNDLED})
option(USE_BUNDLED_LIBUV "Use the bundled libuv." ${USE_BUNDLED})
option(USE_BUNDLED_MSGPACK "Use the bundled msgpack." ${USE_BUNDLED})
option(USE_BUNDLED_UTF8PROC "Use the bundled utf8proc." ${USE_BUNDLED})
option(USE_BUNDLED_LUAJIT "Use the bundled version of luajit." ${USE_BUNDLED})
option(USE_BUNDLED_LUAROCKS "Use the bundled version of luarocks." ${USE_BUNDLED})
option(USE_BUNDLED_LUV "Use the bundled version of luv." ${USE_BUNDLED})
@ -197,9 +196,6 @@ set(GETTEXT_SHA256 66415634c6e8c3fa8b71362879ec7575e27da43da562c798a8a2f223e6e47
set(LIBICONV_URL https://ftp.gnu.org/pub/gnu/libiconv/libiconv-1.15.tar.gz)
set(LIBICONV_SHA256 ccf536620a45458d26ba83887a983b96827001e92a13847b45e4925cc8913178)
set(UTF8PROC_URL https://github.com/JuliaStrings/utf8proc/archive/v2.2.0.tar.gz)
set(UTF8PROC_SHA256 3f8fd1dbdb057ee5ba584a539d5cd1b3952141c0338557cb0bdf8cb9cfed5dbf)
set(TREESITTER_C_URL https://github.com/tree-sitter/tree-sitter-c/archive/6002fcd.tar.gz)
set(TREESITTER_C_SHA256 46f8d44fa886d9ddb92571bb6fa8b175992c8758eca749cb1217464e512b6e97)
@ -254,10 +250,6 @@ if(USE_BUNDLED_LIBICONV)
include(BuildLibiconv)
endif()
if(USE_BUNDLED_UTF8PROC)
include(BuildUtf8proc)
endif()
if(USE_BUNDLED_TS_PARSERS)
include(BuildTreesitterParsers)
endif()

View File

@ -1,68 +0,0 @@
include(CMakeParseArguments)
# BuildUtf8proc(CONFIGURE_COMMAND ... BUILD_COMMAND ... INSTALL_COMMAND ...)
# Reusable function to build utf8proc, wraps ExternalProject_Add.
# Failing to pass a command argument will result in no command being run
function(BuildUtf8proc)
cmake_parse_arguments(_utf8proc
""
""
"CONFIGURE_COMMAND;BUILD_COMMAND;INSTALL_COMMAND"
${ARGN})
# At least one of the three commands must be provided, otherwise the
# external project would be a no-op.
if(NOT _utf8proc_CONFIGURE_COMMAND AND NOT _utf8proc_BUILD_COMMAND
AND NOT _utf8proc_INSTALL_COMMAND)
message(FATAL_ERROR "Must pass at least one of CONFIGURE_COMMAND, BUILD_COMMAND, INSTALL_COMMAND")
endif()
# Download is delegated to the shared DownloadAndExtractFile helper so the
# archive's SHA256 can be verified and sources can be reused across builds.
ExternalProject_Add(utf8proc
PREFIX ${DEPS_BUILD_DIR}
URL ${UTF8PROC_URL}
DOWNLOAD_DIR ${DEPS_DOWNLOAD_DIR}/utf8proc
DOWNLOAD_COMMAND ${CMAKE_COMMAND}
-DPREFIX=${DEPS_BUILD_DIR}
-DDOWNLOAD_DIR=${DEPS_DOWNLOAD_DIR}/utf8proc
-DURL=${UTF8PROC_URL}
-DEXPECTED_SHA256=${UTF8PROC_SHA256}
-DTARGET=utf8proc
-DUSE_EXISTING_SRC_DIR=${USE_EXISTING_SRC_DIR}
-P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/DownloadAndExtractFile.cmake
CONFIGURE_COMMAND "${_utf8proc_CONFIGURE_COMMAND}"
BUILD_COMMAND "${_utf8proc_BUILD_COMMAND}"
INSTALL_COMMAND "${_utf8proc_INSTALL_COMMAND}")
endfunction()
# Default (Unix) commands: configure with the host compiler and -fPIC so the
# static library can be linked into the nvim executable.
set(UTF8PROC_CONFIGURE_COMMAND ${CMAKE_COMMAND} ${DEPS_BUILD_DIR}/src/utf8proc
-DCMAKE_INSTALL_PREFIX=${DEPS_INSTALL_DIR}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
"-DCMAKE_C_FLAGS:STRING=${CMAKE_C_COMPILER_ARG1} -fPIC"
-DCMAKE_GENERATOR=${CMAKE_GENERATOR})
set(UTF8PROC_BUILD_COMMAND ${CMAKE_COMMAND} --build . --config ${CMAKE_BUILD_TYPE})
set(UTF8PROC_INSTALL_COMMAND ${CMAKE_COMMAND} --build . --target install --config ${CMAKE_BUILD_TYPE})
# Cross-compiling with MinGW: forward the toolchain file instead of the host
# compiler settings.
if(MINGW AND CMAKE_CROSSCOMPILING)
get_filename_component(TOOLCHAIN ${CMAKE_TOOLCHAIN_FILE} REALPATH)
set(UTF8PROC_CONFIGURE_COMMAND ${CMAKE_COMMAND} ${DEPS_BUILD_DIR}/src/utf8proc
-DCMAKE_INSTALL_PREFIX=${DEPS_INSTALL_DIR}
# Pass toolchain
-DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN}
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
# Hack to avoid -rdynamic in Mingw
-DCMAKE_SHARED_LIBRARY_LINK_C_FLAGS="")
elseif(MSVC)
# Same as Unix without fPIC
set(UTF8PROC_CONFIGURE_COMMAND ${CMAKE_COMMAND} ${DEPS_BUILD_DIR}/src/utf8proc
-DCMAKE_INSTALL_PREFIX=${DEPS_INSTALL_DIR}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
"-DCMAKE_C_FLAGS:STRING=${CMAKE_C_COMPILER_ARG1}"
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
# Make sure we use the same generator, otherwise we may
# accidentally end up using different MSVC runtimes
-DCMAKE_GENERATOR=${CMAKE_GENERATOR})
endif()
BuildUtf8proc(CONFIGURE_COMMAND ${UTF8PROC_CONFIGURE_COMMAND}
BUILD_COMMAND ${UTF8PROC_BUILD_COMMAND}
INSTALL_COMMAND ${UTF8PROC_INSTALL_COMMAND})