#include "tree_sitter/api.h" #include "./alloc.h" #include "./array.h" #include "./bits.h" #include "./language.h" #include "./point.h" #include "./tree_cursor.h" #include "./unicode.h" #include /* * Stream - A sequence of unicode characters derived from a UTF8 string. * This struct is used in parsing queries from S-expressions. */ typedef struct { const char *input; const char *end; int32_t next; uint8_t next_size; } Stream; /* * QueryStep - A step in the process of matching a query. Each node within * a query S-expression maps to one of these steps. An entire pattern is * represented as a sequence of these steps. Fields: * * - `symbol` - The grammar symbol to match. A zero value represents the * wildcard symbol, '*'. * - `field` - The field name to match. A zero value means that a field name * was not specified. * - `capture_id` - An integer representing the name of the capture associated * with this node in the pattern. A `NONE` value means this node is not * captured in this pattern. * - `depth` - The depth where this node occurs in the pattern. The root node * of the pattern has depth zero. */ typedef struct { TSSymbol symbol; TSFieldId field; uint16_t capture_id; uint16_t depth: 15; bool contains_captures: 1; } QueryStep; /* * Slice - A slice of an external array. Within a query, capture names, * literal string values, and predicate step informations are stored in three * contiguous arrays. Individual captures, string values, and predicates are * represented as slices of these three arrays. */ typedef struct { uint32_t offset; uint32_t length; } Slice; /* * SymbolTable - a two-way mapping of strings to ids. */ typedef struct { Array(char) characters; Array(Slice) slices; } SymbolTable; /* * PatternEntry - Information about the starting point for matching a * particular pattern, consisting of the index of the pattern within the query, * and the index of the patter's first step in the shared `steps` array. These * entries are stored in a 'pattern map' - a sorted array that makes it * possible to efficiently lookup patterns based on the symbol for their first * step. */ typedef struct { uint16_t step_index; uint16_t pattern_index; } PatternEntry; /* * QueryState - The state of an in-progress match of a particular pattern * in a query. While executing, a `TSQueryCursor` must keep track of a number * of possible in-progress matches. Each of those possible matches is * represented as one of these states. */ typedef struct { uint16_t start_depth; uint16_t pattern_index; uint16_t step_index; uint16_t capture_count; uint16_t capture_list_id; uint16_t consumed_capture_count; uint32_t id; } QueryState; /* * CaptureListPool - A collection of *lists* of captures. Each QueryState * needs to maintain its own list of captures. They are all represented as * slices of one shared array. The CaptureListPool keeps track of which * parts of the shared array are currently in use by a QueryState. */ typedef struct { Array(TSQueryCapture) list; uint32_t usage_map; } CaptureListPool; /* * TSQuery - A tree query, compiled from a string of S-expressions. The query * itself is immutable. The mutable state used in the process of executing the * query is stored in a `TSQueryCursor`. */ struct TSQuery { SymbolTable captures; SymbolTable predicate_values; Array(QueryStep) steps; Array(PatternEntry) pattern_map; Array(TSQueryPredicateStep) predicate_steps; Array(Slice) predicates_by_pattern; Array(uint32_t) start_bytes_by_pattern; const TSLanguage *language; uint16_t max_capture_count; uint16_t wildcard_root_pattern_count; TSSymbol *symbol_map; }; /* * TSQueryCursor - A stateful struct used to execute a query on a tree. */ struct TSQueryCursor { const TSQuery *query; TSTreeCursor cursor; Array(QueryState) states; Array(QueryState) finished_states; CaptureListPool capture_list_pool; uint32_t depth; uint32_t start_byte; uint32_t end_byte; uint32_t next_state_id; TSPoint start_point; TSPoint end_point; bool ascending; }; static const TSQueryError PARENT_DONE = -1; static const uint8_t PATTERN_DONE_MARKER = UINT8_MAX; static const uint16_t NONE = UINT16_MAX; static const TSSymbol WILDCARD_SYMBOL = 0; static const uint16_t MAX_STATE_COUNT = 32; // #define LOG(...) fprintf(stderr, __VA_ARGS__) #define LOG(...) /********** * Stream **********/ // Advance to the next unicode code point in the stream. static bool stream_advance(Stream *self) { self->input += self->next_size; if (self->input < self->end) { uint32_t size = ts_decode_utf8( (const uint8_t *)self->input, self->end - self->input, &self->next ); if (size > 0) { self->next_size = size; return true; } } else { self->next_size = 0; self->next = '\0'; } return false; } // Reset the stream to the given input position, represented as a pointer // into the input string. static void stream_reset(Stream *self, const char *input) { self->input = input; self->next_size = 0; stream_advance(self); } static Stream stream_new(const char *string, uint32_t length) { Stream self = { .next = 0, .input = string, .end = string + length, }; stream_advance(&self); return self; } static void stream_skip_whitespace(Stream *stream) { for (;;) { if (iswspace(stream->next)) { stream_advance(stream); } else if (stream->next == ';') { // skip over comments stream_advance(stream); while (stream->next && stream->next != '\n') { if (!stream_advance(stream)) break; } } else { break; } } } static bool stream_is_ident_start(Stream *stream) { return iswalnum(stream->next) || stream->next == '_' || stream->next == '-'; } static void stream_scan_identifier(Stream *stream) { do { stream_advance(stream); } while ( iswalnum(stream->next) || stream->next == '_' || stream->next == '-' || stream->next == '.' || stream->next == '?' || stream->next == '!' ); } /****************** * CaptureListPool ******************/ static CaptureListPool capture_list_pool_new() { return (CaptureListPool) { .list = array_new(), .usage_map = UINT32_MAX, }; } static void capture_list_pool_reset(CaptureListPool *self, uint16_t list_size) { self->usage_map = UINT32_MAX; uint32_t total_size = MAX_STATE_COUNT * list_size; array_reserve(&self->list, total_size); self->list.size = total_size; } static void capture_list_pool_delete(CaptureListPool *self) { array_delete(&self->list); } static TSQueryCapture *capture_list_pool_get(CaptureListPool *self, uint16_t id) { return &self->list.contents[id * (self->list.size / MAX_STATE_COUNT)]; } static bool capture_list_pool_is_empty(const CaptureListPool *self) { return self->usage_map == 0; } static uint16_t capture_list_pool_acquire(CaptureListPool *self) { // In the usage_map bitmask, ones represent free lists, and zeros represent // lists that are in use. A free list id can quickly be found by counting // the leading zeros in the usage map. An id of zero corresponds to the // highest-order bit in the bitmask. uint16_t id = count_leading_zeros(self->usage_map); if (id == 32) return NONE; self->usage_map &= ~bitmask_for_index(id); return id; } static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { self->usage_map |= bitmask_for_index(id); } /************** * SymbolTable **************/ static SymbolTable symbol_table_new() { return (SymbolTable) { .characters = array_new(), .slices = array_new(), }; } static void symbol_table_delete(SymbolTable *self) { array_delete(&self->characters); array_delete(&self->slices); } static int symbol_table_id_for_name( const SymbolTable *self, const char *name, uint32_t length ) { for (unsigned i = 0; i < self->slices.size; i++) { Slice slice = self->slices.contents[i]; if ( slice.length == length && !strncmp(&self->characters.contents[slice.offset], name, length) ) return i; } return -1; } static const char *symbol_table_name_for_id( const SymbolTable *self, uint16_t id, uint32_t *length ) { Slice slice = self->slices.contents[id]; *length = slice.length; return &self->characters.contents[slice.offset]; } static uint16_t symbol_table_insert_name( SymbolTable *self, const char *name, uint32_t length ) { int id = symbol_table_id_for_name(self, name, length); if (id >= 0) return (uint16_t)id; Slice slice = { .offset = self->characters.size, .length = length, }; array_grow_by(&self->characters, length + 1); memcpy(&self->characters.contents[slice.offset], name, length); self->characters.contents[self->characters.size - 1] = 0; array_push(&self->slices, slice); return self->slices.size - 1; } /********* * Query *********/ // The `pattern_map` contains a mapping from TSSymbol values to indices in the // `steps` array. For a given syntax node, the `pattern_map` makes it possible // to quickly find the starting steps of all of the patterns whose root matches // that node. Each entry has two fields: a `pattern_index`, which identifies one // of the patterns in the query, and a `step_index`, which indicates the start // offset of that pattern's steps within the `steps` array. // // The entries are sorted by the patterns' root symbols, and lookups use a // binary search. This ensures that the cost of this initial lookup step // scales logarithmically with the number of patterns in the query. // // This returns `true` if the symbol is present and `false` otherwise. // If the symbol is not present `*result` is set to the index where the // symbol should be inserted. static inline bool ts_query__pattern_map_search( const TSQuery *self, TSSymbol needle, uint32_t *result ) { uint32_t base_index = self->wildcard_root_pattern_count; uint32_t size = self->pattern_map.size - base_index; if (size == 0) { *result = base_index; return false; } while (size > 1) { uint32_t half_size = size / 2; uint32_t mid_index = base_index + half_size; TSSymbol mid_symbol = self->steps.contents[ self->pattern_map.contents[mid_index].step_index ].symbol; if (needle > mid_symbol) base_index = mid_index; size -= half_size; } TSSymbol symbol = self->steps.contents[ self->pattern_map.contents[base_index].step_index ].symbol; if (needle > symbol) { base_index++; if (base_index < self->pattern_map.size) { symbol = self->steps.contents[ self->pattern_map.contents[base_index].step_index ].symbol; } } *result = base_index; return needle == symbol; } // Insert a new pattern's start index into the pattern map, maintaining // the pattern map's ordering invariant. static inline void ts_query__pattern_map_insert( TSQuery *self, TSSymbol symbol, uint32_t start_step_index ) { uint32_t index; ts_query__pattern_map_search(self, symbol, &index); array_insert(&self->pattern_map, index, ((PatternEntry) { .step_index = start_step_index, .pattern_index = self->pattern_map.size, })); } static void ts_query__finalize_steps(TSQuery *self) { for (unsigned i = 0; i < self->steps.size; i++) { QueryStep *step = &self->steps.contents[i]; uint32_t depth = step->depth; if (step->capture_id != NONE) { step->contains_captures = true; } else { step->contains_captures = false; for (unsigned j = i + 1; j < self->steps.size; j++) { QueryStep *s = &self->steps.contents[j]; if (s->depth == PATTERN_DONE_MARKER || s->depth <= depth) break; if (s->capture_id != NONE) step->contains_captures = true; } } } } // Parse a single predicate associated with a pattern, adding it to the // query's internal `predicate_steps` array. Predicates are arbitrary // S-expressions associated with a pattern which are meant to be handled at // a higher level of abstraction, such as the Rust/JavaScript bindings. They // can contain '@'-prefixed capture names, double-quoted strings, and bare // symbols, which also represent strings. static TSQueryError ts_query__parse_predicate( TSQuery *self, Stream *stream ) { if (stream->next == ')') return PARENT_DONE; if (stream->next != '(') return TSQueryErrorSyntax; stream_advance(stream); stream_skip_whitespace(stream); unsigned step_count = 0; for (;;) { if (stream->next == ')') { stream_advance(stream); stream_skip_whitespace(stream); array_back(&self->predicates_by_pattern)->length++; array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeDone, .value_id = 0, })); break; } // Parse an '@'-prefixed capture name else if (stream->next == '@') { stream_advance(stream); // Parse the capture name if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; const char *capture_name = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - capture_name; // Add the capture id to the first step of the pattern int capture_id = symbol_table_id_for_name( &self->captures, capture_name, length ); if (capture_id == -1) { stream_reset(stream, capture_name); return TSQueryErrorCapture; } array_back(&self->predicates_by_pattern)->length++; array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeCapture, .value_id = capture_id, })); } // Parse a string literal else if (stream->next == '"') { stream_advance(stream); // Parse the string content const char *string_content = stream->input; while (stream->next != '"') { if (stream->next == '\n' || !stream_advance(stream)) { stream_reset(stream, string_content - 1); return TSQueryErrorSyntax; } } uint32_t length = stream->input - string_content; // Add a step for the node uint16_t id = symbol_table_insert_name( &self->predicate_values, string_content, length ); array_back(&self->predicates_by_pattern)->length++; array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeString, .value_id = id, })); if (stream->next != '"') return TSQueryErrorSyntax; stream_advance(stream); } // Parse a bare symbol else if (stream_is_ident_start(stream)) { const char *symbol_start = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - symbol_start; uint16_t id = symbol_table_insert_name( &self->predicate_values, symbol_start, length ); array_back(&self->predicates_by_pattern)->length++; array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeString, .value_id = id, })); } else { return TSQueryErrorSyntax; } step_count++; stream_skip_whitespace(stream); } return 0; } // Read one S-expression pattern from the stream, and incorporate it into // the query's internal state machine representation. For nested patterns, // this function calls itself recursively. static TSQueryError ts_query__parse_pattern( TSQuery *self, Stream *stream, uint32_t depth, uint32_t *capture_count ) { uint16_t starting_step_index = self->steps.size; if (stream->next == 0) return TSQueryErrorSyntax; // Finish the parent S-expression if (stream->next == ')') { return PARENT_DONE; } // Parse a parenthesized node expression else if (stream->next == '(') { stream_advance(stream); stream_skip_whitespace(stream); // Parse a nested list, which represents a pattern followed by // zero-or-more predicates. if (stream->next == '(' && depth == 0) { TSQueryError e = ts_query__parse_pattern(self, stream, 0, capture_count); if (e) return e; // Parse the predicates. stream_skip_whitespace(stream); for (;;) { TSQueryError e = ts_query__parse_predicate(self, stream); if (e == PARENT_DONE) { stream_advance(stream); stream_skip_whitespace(stream); return 0; } else if (e) { return e; } } } TSSymbol symbol; // Parse the wildcard symbol if (stream->next == '*') { symbol = WILDCARD_SYMBOL; stream_advance(stream); } // Parse a normal node name else if (stream_is_ident_start(stream)) { const char *node_name = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - node_name; symbol = ts_language_symbol_for_name( self->language, node_name, length, true ); if (!symbol) { stream_reset(stream, node_name); return TSQueryErrorNodeType; } } else { return TSQueryErrorSyntax; } // Add a step for the node. array_push(&self->steps, ((QueryStep) { .depth = depth, .symbol = symbol, .field = 0, .capture_id = NONE, .contains_captures = false, })); // Parse the child patterns stream_skip_whitespace(stream); for (;;) { TSQueryError e = ts_query__parse_pattern(self, stream, depth + 1, capture_count); if (e == PARENT_DONE) { stream_advance(stream); break; } else if (e) { return e; } } } // Parse a double-quoted anonymous leaf node expression else if (stream->next == '"') { stream_advance(stream); // Parse the string content const char *string_content = stream->input; while (stream->next != '"') { if (!stream_advance(stream)) { stream_reset(stream, string_content - 1); return TSQueryErrorSyntax; } } uint32_t length = stream->input - string_content; // Add a step for the node TSSymbol symbol = ts_language_symbol_for_name( self->language, string_content, length, false ); if (!symbol) { stream_reset(stream, string_content); return TSQueryErrorNodeType; } array_push(&self->steps, ((QueryStep) { .depth = depth, .symbol = symbol, .field = 0, .capture_id = NONE, .contains_captures = false, })); if (stream->next != '"') return TSQueryErrorSyntax; stream_advance(stream); } // Parse a field-prefixed pattern else if (stream_is_ident_start(stream)) { // Parse the field name const char *field_name = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - field_name; stream_skip_whitespace(stream); if (stream->next != ':') { stream_reset(stream, field_name); return TSQueryErrorSyntax; } stream_advance(stream); stream_skip_whitespace(stream); // Parse the pattern uint32_t step_index = self->steps.size; TSQueryError e = ts_query__parse_pattern(self, stream, depth, capture_count); if (e == PARENT_DONE) return TSQueryErrorSyntax; if (e) return e; // Add the field name to the first step of the pattern TSFieldId field_id = ts_language_field_id_for_name( self->language, field_name, length ); if (!field_id) { stream->input = field_name; return TSQueryErrorField; } self->steps.contents[step_index].field = field_id; } // Parse a wildcard pattern else if (stream->next == '*') { stream_advance(stream); stream_skip_whitespace(stream); // Add a step that matches any kind of node array_push(&self->steps, ((QueryStep) { .depth = depth, .symbol = WILDCARD_SYMBOL, .field = 0, .contains_captures = false, })); } else { return TSQueryErrorSyntax; } stream_skip_whitespace(stream); // Parse an '@'-prefixed capture pattern if (stream->next == '@') { stream_advance(stream); // Parse the capture name if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; const char *capture_name = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - capture_name; // Add the capture id to the first step of the pattern uint16_t capture_id = symbol_table_insert_name( &self->captures, capture_name, length ); self->steps.contents[starting_step_index].capture_id = capture_id; (*capture_count)++; stream_skip_whitespace(stream); } return 0; } TSQuery *ts_query_new( const TSLanguage *language, const char *source, uint32_t source_len, uint32_t *error_offset, TSQueryError *error_type ) { TSSymbol *symbol_map; if (ts_language_version(language) >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) { symbol_map = NULL; } else { // Work around the fact that multiple symbols can currently be // associated with the same name, due to "simple aliases". // In the next language ABI version, this map will be contained // in the language's `public_symbol_map` field. uint32_t symbol_count = ts_language_symbol_count(language); symbol_map = ts_malloc(sizeof(TSSymbol) * symbol_count); for (unsigned i = 0; i < symbol_count; i++) { const char *name = ts_language_symbol_name(language, i); const TSSymbolType symbol_type = ts_language_symbol_type(language, i); symbol_map[i] = i; for (unsigned j = 0; j < i; j++) { if (ts_language_symbol_type(language, j) == symbol_type) { if (!strcmp(name, ts_language_symbol_name(language, j))) { symbol_map[i] = j; break; } } } } } TSQuery *self = ts_malloc(sizeof(TSQuery)); *self = (TSQuery) { .steps = array_new(), .pattern_map = array_new(), .captures = symbol_table_new(), .predicate_values = symbol_table_new(), .predicate_steps = array_new(), .predicates_by_pattern = array_new(), .symbol_map = symbol_map, .wildcard_root_pattern_count = 0, .max_capture_count = 0, .language = language, }; // Parse all of the S-expressions in the given string. Stream stream = stream_new(source, source_len); stream_skip_whitespace(&stream); uint32_t start_step_index; while (stream.input < stream.end) { start_step_index = self->steps.size; uint32_t capture_count = 0; array_push(&self->start_bytes_by_pattern, stream.input - source); array_push(&self->predicates_by_pattern, ((Slice) { .offset = self->predicate_steps.size, .length = 0, })); *error_type = ts_query__parse_pattern(self, &stream, 0, &capture_count); array_push(&self->steps, ((QueryStep) { .depth = PATTERN_DONE_MARKER })); // If any pattern could not be parsed, then report the error information // and terminate. if (*error_type) { *error_offset = stream.input - source; ts_query_delete(self); return NULL; } // Maintain a map that can look up patterns for a given root symbol. ts_query__pattern_map_insert( self, self->steps.contents[start_step_index].symbol, start_step_index ); if (self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL) { self->wildcard_root_pattern_count++; } // Keep track of the maximum number of captures in pattern, because // that numer determines how much space is needed to store each capture // list. if (capture_count > self->max_capture_count) { self->max_capture_count = capture_count; } } ts_query__finalize_steps(self); return self; } void ts_query_delete(TSQuery *self) { if (self) { array_delete(&self->steps); array_delete(&self->pattern_map); array_delete(&self->predicate_steps); array_delete(&self->predicates_by_pattern); array_delete(&self->start_bytes_by_pattern); symbol_table_delete(&self->captures); symbol_table_delete(&self->predicate_values); ts_free(self->symbol_map); ts_free(self); } } uint32_t ts_query_pattern_count(const TSQuery *self) { return self->predicates_by_pattern.size; } uint32_t ts_query_capture_count(const TSQuery *self) { return self->captures.slices.size; } uint32_t ts_query_string_count(const TSQuery *self) { return self->predicate_values.slices.size; } const char *ts_query_capture_name_for_id( const TSQuery *self, uint32_t index, uint32_t *length ) { return symbol_table_name_for_id(&self->captures, index, length); } const char *ts_query_string_value_for_id( const TSQuery *self, uint32_t index, uint32_t *length ) { return symbol_table_name_for_id(&self->predicate_values, index, length); } const TSQueryPredicateStep *ts_query_predicates_for_pattern( const TSQuery *self, uint32_t pattern_index, uint32_t *step_count ) { Slice slice = self->predicates_by_pattern.contents[pattern_index]; *step_count = slice.length; return &self->predicate_steps.contents[slice.offset]; } uint32_t ts_query_start_byte_for_pattern( const TSQuery *self, uint32_t pattern_index ) { return self->start_bytes_by_pattern.contents[pattern_index]; } void ts_query_disable_capture( TSQuery *self, const char *name, uint32_t length ) { // Remove capture information for any pattern step that previously // captured with the given name. int id = symbol_table_id_for_name(&self->captures, name, length); if (id != -1) { for (unsigned i = 0; i < self->steps.size; i++) { QueryStep *step = &self->steps.contents[i]; if (step->capture_id == id) { step->capture_id = NONE; } } ts_query__finalize_steps(self); } } void ts_query_disable_pattern( TSQuery *self, uint32_t pattern_index ) { // Remove the given pattern from the pattern map. Its steps will still // be in the `steps` array, but they will never be read. for (unsigned i = 0; i < self->pattern_map.size; i++) { PatternEntry *pattern = &self->pattern_map.contents[i]; if (pattern->pattern_index == pattern_index) { array_erase(&self->pattern_map, i); i--; } } } /*************** * QueryCursor ***************/ TSQueryCursor *ts_query_cursor_new() { TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor)); *self = (TSQueryCursor) { .ascending = false, .states = array_new(), .finished_states = array_new(), .capture_list_pool = capture_list_pool_new(), .start_byte = 0, .end_byte = UINT32_MAX, .start_point = {0, 0}, .end_point = POINT_MAX, }; array_reserve(&self->states, MAX_STATE_COUNT); array_reserve(&self->finished_states, MAX_STATE_COUNT); return self; } void ts_query_cursor_delete(TSQueryCursor *self) { array_delete(&self->states); array_delete(&self->finished_states); ts_tree_cursor_delete(&self->cursor); capture_list_pool_delete(&self->capture_list_pool); ts_free(self); } void ts_query_cursor_exec( TSQueryCursor *self, const TSQuery *query, TSNode node ) { array_clear(&self->states); array_clear(&self->finished_states); ts_tree_cursor_reset(&self->cursor, node); capture_list_pool_reset(&self->capture_list_pool, query->max_capture_count); self->next_state_id = 0; self->depth = 0; self->ascending = false; self->query = query; } void ts_query_cursor_set_byte_range( TSQueryCursor *self, uint32_t start_byte, uint32_t end_byte ) { if (end_byte == 0) { start_byte = 0; end_byte = UINT32_MAX; } self->start_byte = start_byte; self->end_byte = end_byte; } void ts_query_cursor_set_point_range( TSQueryCursor *self, TSPoint start_point, TSPoint end_point ) { if (end_point.row == 0 && end_point.column == 0) { start_point = POINT_ZERO; end_point = POINT_MAX; } self->start_point = start_point; self->end_point = end_point; } // Search through all of the in-progress states, and find the captured // node that occurs earliest in the document. static bool ts_query_cursor__first_in_progress_capture( TSQueryCursor *self, uint32_t *state_index, uint32_t *byte_offset, uint32_t *pattern_index ) { bool result = false; for (unsigned i = 0; i < self->states.size; i++) { const QueryState *state = &self->states.contents[i]; if (state->capture_count > 0) { const TSQueryCapture *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); uint32_t capture_byte = ts_node_start_byte(captures[0].node); if ( !result || capture_byte < *byte_offset || ( capture_byte == *byte_offset && state->pattern_index < *pattern_index ) ) { result = true; *state_index = i; *byte_offset = capture_byte; *pattern_index = state->pattern_index; } } } return result; } static bool ts_query__cursor_add_state( TSQueryCursor *self, const PatternEntry *pattern ) { uint32_t list_id = capture_list_pool_acquire(&self->capture_list_pool); // If there are no capture lists left in the pool, then terminate whichever // state has captured the earliest node in the document, and steal its // capture list. if (list_id == NONE) { uint32_t state_index, byte_offset, pattern_index; if (ts_query_cursor__first_in_progress_capture( self, &state_index, &byte_offset, &pattern_index )) { LOG( " abandon state. index:%u, pattern:%u, offset:%u.\n", state_index, pattern_index, byte_offset ); list_id = self->states.contents[state_index].capture_list_id; array_erase(&self->states, state_index); } else { LOG(" too many finished states.\n"); return false; } } LOG(" start state. pattern:%u\n", pattern->pattern_index); array_push(&self->states, ((QueryState) { .capture_list_id = list_id, .step_index = pattern->step_index, .pattern_index = pattern->pattern_index, .start_depth = self->depth, .capture_count = 0, .consumed_capture_count = 0, })); return true; } static QueryState *ts_query__cursor_copy_state( TSQueryCursor *self, const QueryState *state ) { uint32_t new_list_id = capture_list_pool_acquire(&self->capture_list_pool); if (new_list_id == NONE) return NULL; array_push(&self->states, *state); QueryState *new_state = array_back(&self->states); new_state->capture_list_id = new_list_id; TSQueryCapture *old_captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); TSQueryCapture *new_captures = capture_list_pool_get( &self->capture_list_pool, new_list_id ); memcpy(new_captures, old_captures, state->capture_count * sizeof(TSQueryCapture)); return new_state; } // Walk the tree, processing patterns until at least one pattern finishes, // If one or more patterns finish, return `true` and store their states in the // `finished_states` array. Multiple patterns can finish on the same node. If // there are no more matches, return `false`. static inline bool ts_query_cursor__advance(TSQueryCursor *self) { do { if (self->ascending) { LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor))); // When leaving a node, remove any unfinished states whose next step // needed to match something within that node. uint32_t deleted_count = 0; for (unsigned i = 0, n = self->states.size; i < n; i++) { QueryState *state = &self->states.contents[i]; QueryStep *step = &self->query->steps.contents[state->step_index]; if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { LOG( " failed to match. pattern:%u, step:%u\n", state->pattern_index, state->step_index ); capture_list_pool_release( &self->capture_list_pool, state->capture_list_id ); deleted_count++; } else if (deleted_count > 0) { self->states.contents[i - deleted_count] = *state; } } self->states.size -= deleted_count; if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { self->ascending = false; } else if (ts_tree_cursor_goto_parent(&self->cursor)) { self->depth--; } else { return self->finished_states.size > 0; } } else { bool can_have_later_siblings; bool can_have_later_siblings_with_this_field; TSFieldId field_id = ts_tree_cursor_current_status( &self->cursor, &can_have_later_siblings, &can_have_later_siblings_with_this_field ); TSNode node = ts_tree_cursor_current_node(&self->cursor); TSSymbol symbol = ts_node_symbol(node); if (symbol != ts_builtin_sym_error && self->query->symbol_map) { symbol = self->query->symbol_map[symbol]; } // If this node is before the selected range, then avoid descending // into it. if ( ts_node_end_byte(node) <= self->start_byte || point_lte(ts_node_end_point(node), self->start_point) ) { if (!ts_tree_cursor_goto_next_sibling(&self->cursor)) { self->ascending = true; } continue; } // If this node is after the selected range, then stop walking. if ( self->end_byte <= ts_node_start_byte(node) || point_lte(self->end_point, ts_node_start_point(node)) ) return false; LOG( "enter node. type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u, can_have_later_siblings:%d, can_have_later_siblings_with_this_field:%d\n", ts_node_type(node), ts_language_field_name_for_id(self->query->language, field_id), ts_node_start_point(node).row, self->states.size, self->finished_states.size, can_have_later_siblings, can_have_later_siblings_with_this_field ); // Add new states for any patterns whose root node is a wildcard. for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { PatternEntry *pattern = &self->query->pattern_map.contents[i]; QueryStep *step = &self->query->steps.contents[pattern->step_index]; // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; if (!ts_query__cursor_add_state(self, pattern)) break; } // Add new states for any patterns whose root node matches this node. unsigned i; if (ts_query__pattern_map_search(self->query, symbol, &i)) { PatternEntry *pattern = &self->query->pattern_map.contents[i]; QueryStep *step = &self->query->steps.contents[pattern->step_index]; do { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; if (!ts_query__cursor_add_state(self, pattern)) break; // Advance to the next pattern whose root node matches this node. i++; if (i == self->query->pattern_map.size) break; pattern = &self->query->pattern_map.contents[i]; step = &self->query->steps.contents[pattern->step_index]; } while (step->symbol == symbol); } // Update all of the in-progress states with current node. for (unsigned i = 0, n = self->states.size; i < n; i++) { QueryState *state = &self->states.contents[i]; QueryStep *step = &self->query->steps.contents[state->step_index]; // Check that the node matches all of the criteria for the next // step of the pattern.if ( if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue; // Determine if this node matches this step of the pattern, and also // if this node can have later siblings that match this step of the // pattern. bool node_does_match = !step->symbol || step->symbol == symbol; bool later_sibling_can_match = can_have_later_siblings; if (step->field) { if (step->field == field_id) { if (!can_have_later_siblings_with_this_field) { later_sibling_can_match = false; } } else { node_does_match = false; } } if (!node_does_match) { if (!later_sibling_can_match) { LOG( " discard state. pattern:%u, step:%u\n", state->pattern_index, state->step_index ); capture_list_pool_release( &self->capture_list_pool, state->capture_list_id ); array_erase(&self->states, i); i--; n--; } continue; } // Some patterns can match their root node in multiple ways, // capturing different children. If this pattern step could match // later children within the same parent, then this query state // cannot simply be updated in place. It must be split into two // states: one that matches this node, and one which skips over // this node, to preserve the possibility of matching later // siblings. QueryState *next_state = state; if ( step->depth > 0 && step->contains_captures && later_sibling_can_match ) { QueryState *copy = ts_query__cursor_copy_state(self, state); if (copy) { LOG( " split state. pattern:%u, step:%u\n", copy->pattern_index, copy->step_index ); next_state = copy; } else { LOG(" canot split state.\n"); } } LOG( " advance state. pattern:%u, step:%u\n", next_state->pattern_index, next_state->step_index ); // If the current node is captured in this pattern, add it to the // capture list. if (step->capture_id != NONE) { LOG( " capture node. pattern:%u, capture_id:%u\n", next_state->pattern_index, step->capture_id ); TSQueryCapture *capture_list = capture_list_pool_get( &self->capture_list_pool, next_state->capture_list_id ); capture_list[next_state->capture_count++] = (TSQueryCapture) { node, step->capture_id }; } // If the pattern is now done, then remove it from the list of // in-progress states, and add it to the list of finished states. next_state->step_index++; QueryStep *next_step = step + 1; if (next_step->depth == PATTERN_DONE_MARKER) { LOG(" finish pattern %u\n", next_state->pattern_index); next_state->id = self->next_state_id++; array_push(&self->finished_states, *next_state); if (next_state == state) { array_erase(&self->states, i); i--; n--; } else { self->states.size--; } } } // Continue descending if possible. if (ts_tree_cursor_goto_first_child(&self->cursor)) { self->depth++; } else { self->ascending = true; } } } while (self->finished_states.size == 0); return true; } bool ts_query_cursor_next_match( TSQueryCursor *self, TSQueryMatch *match ) { if (self->finished_states.size == 0) { if (!ts_query_cursor__advance(self)) { return false; } } QueryState *state = &self->finished_states.contents[0]; match->id = state->id; match->pattern_index = state->pattern_index; match->capture_count = state->capture_count; match->captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); array_erase(&self->finished_states, 0); return true; } void ts_query_cursor_remove_match( TSQueryCursor *self, uint32_t match_id ) { for (unsigned i = 0; i < self->finished_states.size; i++) { const QueryState *state = &self->finished_states.contents[i]; if (state->id == match_id) { capture_list_pool_release( &self->capture_list_pool, state->capture_list_id ); array_erase(&self->finished_states, i); return; } } } bool ts_query_cursor_next_capture( TSQueryCursor *self, TSQueryMatch *match, uint32_t *capture_index ) { for (;;) { // The goal here is to return captures in order, even though they may not // be discovered in order, because patterns can overlap. If there are any // finished patterns, then try to find one that contains a capture that // is *definitely* before any capture in an *unfinished* pattern. if (self->finished_states.size > 0) { // First, identify the position of the earliest capture in an unfinished // match. For a finished capture to be returned, it must be *before* // this position. uint32_t first_unfinished_capture_byte = UINT32_MAX; uint32_t first_unfinished_pattern_index = UINT32_MAX; uint32_t first_unfinished_state_index; ts_query_cursor__first_in_progress_capture( self, &first_unfinished_state_index, &first_unfinished_capture_byte, &first_unfinished_pattern_index ); // Find the earliest capture in a finished match. int first_finished_state_index = -1; uint32_t first_finished_capture_byte = first_unfinished_capture_byte; uint32_t first_finished_pattern_index = first_unfinished_pattern_index; for (unsigned i = 0; i < self->finished_states.size; i++) { const QueryState *state = &self->finished_states.contents[i]; if (state->capture_count > state->consumed_capture_count) { const TSQueryCapture *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); uint32_t capture_byte = ts_node_start_byte( captures[state->consumed_capture_count].node ); if ( capture_byte < first_finished_capture_byte || ( capture_byte == first_finished_capture_byte && state->pattern_index < first_finished_pattern_index ) ) { first_finished_state_index = i; first_finished_capture_byte = capture_byte; first_finished_pattern_index = state->pattern_index; } } else { capture_list_pool_release( &self->capture_list_pool, state->capture_list_id ); array_erase(&self->finished_states, i); i--; } } // If there is finished capture that is clearly before any unfinished // capture, then return its match, and its capture index. Internally // record the fact that the capture has been 'consumed'. if (first_finished_state_index != -1) { QueryState *state = &self->finished_states.contents[ first_finished_state_index ]; match->id = state->id; match->pattern_index = state->pattern_index; match->capture_count = state->capture_count; match->captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); *capture_index = state->consumed_capture_count; state->consumed_capture_count++; return true; } if (capture_list_pool_is_empty(&self->capture_list_pool)) { LOG( " abandon state. index:%u, pattern:%u, offset:%u.\n", first_unfinished_state_index, first_unfinished_pattern_index, first_unfinished_capture_byte ); capture_list_pool_release( &self->capture_list_pool, self->states.contents[first_unfinished_state_index].capture_list_id ); array_erase(&self->states, first_unfinished_state_index); } } // If there are no finished matches that are ready to be returned, then // continue finding more matches. if (!ts_query_cursor__advance(self)) return false; } } #undef LOG