Skip to content

Analyzer

Tree-sitter based code analysis.

This module provides functionality for analyzing source code using tree- sitter.

logger module-attribute

logger = getLogger(__name__)

LANGUAGE_NAMES module-attribute

LANGUAGE_NAMES: dict[str, SupportedLanguage] = {
	"python": "python",
	"javascript": "javascript",
	"typescript": "typescript",
}

get_language_by_extension

get_language_by_extension(file_path: Path) -> str | None

Get language name from file extension.

Parameters:

Name Type Description Default
file_path Path

Path to the file

required

Returns:

Type Description
str | None

Language name if supported, None otherwise

Source code in src/codemap/processor/tree_sitter/analyzer.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def get_language_by_extension(file_path: Path) -> str | None:
	"""
	Get language name from file extension.

	Args:
	    file_path: Path to the file

	Returns:
	    Language name if supported, None otherwise

	"""
	ext = file_path.suffix
	for lang, config in LANGUAGE_CONFIGS.items():
		if ext in config.file_extensions:
			return lang
	return None

TreeSitterAnalyzer

Analyzer for source code using tree-sitter.

Source code in src/codemap/processor/tree_sitter/analyzer.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
class TreeSitterAnalyzer:
	"""Analyzer for source code using tree-sitter."""

	def __init__(self) -> None:
		"""Initialize the tree-sitter analyzer."""
		self.parsers: dict[str, Parser] = {}
		self.ast_cache: dict[Path, tuple[float, Node, Parser]] = {}

	def get_parser(self, language: str) -> Parser | None:
		"""
		Get the parser for a language, loading it if necessary.

		Args:
		    language: The language to get a parser for

		Returns:
		    A tree-sitter parser or None if not supported
		"""
		if language in self.parsers:
			return self.parsers[language]

		# Lazy load the parser
		ts_lang_name = LANGUAGE_NAMES.get(language)
		if not ts_lang_name:
			return None
		try:
			lang_obj: Language = get_language(ts_lang_name)
			parser: Parser = Parser()
			parser.language = lang_obj
			self.parsers[language] = parser
			return parser
		except (ValueError, RuntimeError, ImportError) as e:
			logger.debug("Failed to load language %s: %s", language, str(e))
			return None

	def parse_file(
		self, file_path: Path, language: str | None = None
	) -> tuple[Node | None, str, Parser | None, bytes | None]:
		"""
		Parse a file and return its root node, determined language, the parser used, and content_bytes.

		Utilizes a cache to avoid re-parsing unchanged files.

		Args:
		    file_path: Path to the file to parse
		    language: Optional language override

		Returns:
		    A tuple containing the parse tree root node (or None if parsing failed),
		    the determined language string, the parser instance, and the file content as bytes (if read).
		"""
		# Determine language if not provided
		determined_language = language
		if not determined_language:
			determined_language = get_language_by_extension(file_path)
			if not determined_language:
				logger.debug("Could not determine language for file %s", file_path)
				return None, "", None, None

		try:
			current_mtime = file_path.stat().st_mtime
			if file_path in self.ast_cache:
				# Assuming cache stores (mtime, ast_root, parser, content_bytes)
				# For now, let's simplify and not cache content_bytes directly with AST to avoid large cache items
				# We will re-read if cache hit for AST, but this is still better than re-parsing.
				# A more advanced cache could handle content_bytes.
				cached_mtime, cached_tree_root, cached_parser = self.ast_cache[file_path]
				if current_mtime == cached_mtime:
					logger.debug("AST cache hit for: %s. Content will be re-read by caller if needed.", file_path)
					# To return content_bytes here, we would need to have cached it or re-read it.
					# For simplicity of this step, parse_file will return None for content_bytes
					# on a pure AST cache hit.
					# The caller (analyze_file) will then read it. The main win (no re-parse) is achieved.
					return cached_tree_root, determined_language, cached_parser, None
		except FileNotFoundError:
			logger.warning("File not found during mtime check: %s", file_path)
			if file_path in self.ast_cache:
				del self.ast_cache[file_path]
			return None, determined_language, None, None

		parser = self.get_parser(determined_language)
		if not parser:
			logger.debug("No parser for language %s", determined_language)
			return None, determined_language, None, None

		content_bytes_read: bytes | None = None  # Initialize here
		try:
			with file_path.open("rb") as f:
				content_bytes_read = f.read()
			tree = parser.parse(content_bytes_read)
			root_node = tree.root_node
			current_mtime_after_read = file_path.stat().st_mtime
			self.ast_cache[file_path] = (
				current_mtime_after_read,
				root_node,
				parser,
			)  # Not caching content_bytes in AST cache for now
			logger.debug("Parsed and cached AST for: %s", file_path)
			return root_node, determined_language, parser, content_bytes_read  # Return read content_bytes
		except FileNotFoundError:
			logger.warning("File not found during parsing: %s", file_path)
			if file_path in self.ast_cache:
				del self.ast_cache[file_path]
			# If file not found, content_bytes_read would not have been assigned (or remains None)
			return None, determined_language, parser, None
		except Exception:
			logger.exception("Failed to parse file %s", file_path)
			# content_bytes_read will be None if open failed, or the read bytes if parse failed
			return None, determined_language, parser, content_bytes_read

	def get_syntax_handler(self, language: str) -> LanguageSyntaxHandler | None:
		"""
		Get the syntax handler for a language.

		Args:
		    language: The language to get a handler for

		Returns:
		    A syntax handler or None if not supported

		"""
		handler_class = LANGUAGE_HANDLERS.get(language)
		if not handler_class:
			return None
		return handler_class()

	def analyze_node(
		self,
		node: Node,
		content_bytes: bytes,
		file_path: Path,
		language: str,
		handler: LanguageSyntaxHandler,
		parent_node: Node | None = None,
	) -> dict:
		"""
		Analyze a tree-sitter node and return structured information.

		Args:
		    node: The tree-sitter node
		    content_bytes: Source code content as bytes
		    file_path: Path to the source file
		    language: Programming language
		    handler: Language-specific syntax handler
		    parent_node: Parent node if any

		Returns:
		    Dict with node analysis information

		"""
		# Check if we should skip this node
		if handler.should_skip_node(node):
			return {}

		# Get entity type for this node from the handler
		entity_type = handler.get_entity_type(node, parent_node, content_bytes)

		# Skip unknown/uninteresting nodes unless they might contain interesting children
		if entity_type == EntityType.UNKNOWN and not node.named_child_count > 0:
			return {}

		# Get name and other metadata
		name = handler.extract_name(node, content_bytes)
		docstring_text, docstring_node = handler.find_docstring(node, content_bytes)

		# Get node content
		try:
			node_content = content_bytes[node.start_byte : node.end_byte].decode("utf-8", errors="ignore")
		except (UnicodeDecodeError, IndexError):
			node_content = ""

		# Extract dependencies from import statements
		dependencies = []
		if entity_type == EntityType.IMPORT:
			try:
				dependencies = handler.extract_imports(node, content_bytes)
			except (AttributeError, UnicodeDecodeError, IndexError, ValueError) as e:
				logger.debug("Failed to extract dependencies: %s", e)

		# Build result
		result = {
			"type": entity_type.name if entity_type != EntityType.UNKNOWN else "UNKNOWN",
			"name": name,
			"location": {
				"start_line": node.start_point[0] + 1,  # Convert to 1-based
				"end_line": node.end_point[0] + 1,
				"start_col": node.start_point[1],
				"end_col": node.end_point[1],
			},
			"docstring": docstring_text,
			"content": node_content,
			"children": [],
			"language": language,
		}

		# Add dependencies only if they exist to keep the output clean
		if dependencies:
			result["dependencies"] = dependencies

		# Extract function calls if the entity is a function or method
		calls = []
		if entity_type in (EntityType.FUNCTION, EntityType.METHOD):
			body_node = handler.get_body_node(node)
			if body_node:
				try:
					calls = handler.extract_calls(body_node, content_bytes)
				except (AttributeError, IndexError, UnicodeDecodeError, ValueError) as e:
					logger.debug("Failed to extract calls for %s: %s", name or "<anonymous>", e)

		# Add calls only if they exist
		if calls:
			result["calls"] = calls

		# Process child nodes
		body_node = handler.get_body_node(node)
		children_to_process = handler.get_children_to_process(node, body_node)

		for child in children_to_process:
			if docstring_node and child == docstring_node:
				continue  # Skip docstring node

			child_result = self.analyze_node(child, content_bytes, file_path, language, handler, node)

			if child_result:  # Only add non-empty results
				result["children"].append(child_result)

		return result

	def analyze_file(
		self,
		file_path: Path,
		language: str | None = None,
	) -> dict:
		"""
		Analyze a file and return its structural information.

		Uses cached ASTs.
		The returned dictionary will include a 'full_content_str' key
		if the file content was successfully read.

		Args:
		    file_path: Path to the file
		    language: Optional language override

		Returns:
		    Structured analysis of the file or an empty dict on failure.
		    Includes 'full_content_str' with the file's decoded content if read.
		"""
		root_node, resolved_language, _, read_content_bytes = self.parse_file(file_path, language)

		if not root_node or not resolved_language:
			return {}

		handler = self.get_syntax_handler(resolved_language)
		if not handler:
			logger.debug("No syntax handler for language %s", resolved_language)
			return {}

		content_bytes_for_analysis = read_content_bytes
		decoded_full_content_str = None

		if content_bytes_for_analysis is None:
			try:
				with file_path.open("rb") as f:
					content_bytes_for_analysis = f.read()
			except Exception:
				logger.exception("Failed to re-read file content for analysis %s", file_path)
				return {}

		if content_bytes_for_analysis is not None:
			try:
				decoded_full_content_str = content_bytes_for_analysis.decode("utf-8", errors="ignore")
			except Exception:
				logger.exception("Failed to decode file content for %s", file_path)
				# Continue without decoded_full_content_str if decoding fails

		# Perform node-level analysis (recursive)
		analysis_data = self.analyze_node(root_node, content_bytes_for_analysis, file_path, resolved_language, handler)

		# Add the full decoded content to the top-level result if available
		if decoded_full_content_str is not None:
			analysis_data["full_content_str"] = decoded_full_content_str

		return analysis_data

__init__

__init__() -> None

Initialize the tree-sitter analyzer.

Source code in src/codemap/processor/tree_sitter/analyzer.py
56
57
58
59
def __init__(self) -> None:
	"""Initialize the tree-sitter analyzer."""
	self.parsers: dict[str, Parser] = {}
	self.ast_cache: dict[Path, tuple[float, Node, Parser]] = {}

parsers instance-attribute

parsers: dict[str, Parser] = {}

ast_cache instance-attribute

ast_cache: dict[Path, tuple[float, Node, Parser]] = {}

get_parser

get_parser(language: str) -> Parser | None

Get the parser for a language, loading it if necessary.

Parameters:

Name Type Description Default
language str

The language to get a parser for

required

Returns:

Type Description
Parser | None

A tree-sitter parser or None if not supported

Source code in src/codemap/processor/tree_sitter/analyzer.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def get_parser(self, language: str) -> Parser | None:
	"""
	Get the parser for a language, loading it if necessary.

	Args:
	    language: The language to get a parser for

	Returns:
	    A tree-sitter parser or None if not supported
	"""
	if language in self.parsers:
		return self.parsers[language]

	# Lazy load the parser
	ts_lang_name = LANGUAGE_NAMES.get(language)
	if not ts_lang_name:
		return None
	try:
		lang_obj: Language = get_language(ts_lang_name)
		parser: Parser = Parser()
		parser.language = lang_obj
		self.parsers[language] = parser
		return parser
	except (ValueError, RuntimeError, ImportError) as e:
		logger.debug("Failed to load language %s: %s", language, str(e))
		return None

parse_file

parse_file(
	file_path: Path, language: str | None = None
) -> tuple[Node | None, str, Parser | None, bytes | None]

Parse a file and return its root node, determined language, the parser used, and content_bytes.

Utilizes a cache to avoid re-parsing unchanged files.

Parameters:

Name Type Description Default
file_path Path

Path to the file to parse

required
language str | None

Optional language override

None

Returns:

Type Description
Node | None

A tuple containing the parse tree root node (or None if parsing failed),

str

the determined language string, the parser instance, and the file content as bytes (if read).

Source code in src/codemap/processor/tree_sitter/analyzer.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def parse_file(
	self, file_path: Path, language: str | None = None
) -> tuple[Node | None, str, Parser | None, bytes | None]:
	"""
	Parse a file and return its root node, determined language, the parser used, and content_bytes.

	Utilizes a cache to avoid re-parsing unchanged files.

	Args:
	    file_path: Path to the file to parse
	    language: Optional language override

	Returns:
	    A tuple containing the parse tree root node (or None if parsing failed),
	    the determined language string, the parser instance, and the file content as bytes (if read).
	"""
	# Determine language if not provided
	determined_language = language
	if not determined_language:
		determined_language = get_language_by_extension(file_path)
		if not determined_language:
			logger.debug("Could not determine language for file %s", file_path)
			return None, "", None, None

	try:
		current_mtime = file_path.stat().st_mtime
		if file_path in self.ast_cache:
			# Assuming cache stores (mtime, ast_root, parser, content_bytes)
			# For now, let's simplify and not cache content_bytes directly with AST to avoid large cache items
			# We will re-read if cache hit for AST, but this is still better than re-parsing.
			# A more advanced cache could handle content_bytes.
			cached_mtime, cached_tree_root, cached_parser = self.ast_cache[file_path]
			if current_mtime == cached_mtime:
				logger.debug("AST cache hit for: %s. Content will be re-read by caller if needed.", file_path)
				# To return content_bytes here, we would need to have cached it or re-read it.
				# For simplicity of this step, parse_file will return None for content_bytes
				# on a pure AST cache hit.
				# The caller (analyze_file) will then read it. The main win (no re-parse) is achieved.
				return cached_tree_root, determined_language, cached_parser, None
	except FileNotFoundError:
		logger.warning("File not found during mtime check: %s", file_path)
		if file_path in self.ast_cache:
			del self.ast_cache[file_path]
		return None, determined_language, None, None

	parser = self.get_parser(determined_language)
	if not parser:
		logger.debug("No parser for language %s", determined_language)
		return None, determined_language, None, None

	content_bytes_read: bytes | None = None  # Initialize here
	try:
		with file_path.open("rb") as f:
			content_bytes_read = f.read()
		tree = parser.parse(content_bytes_read)
		root_node = tree.root_node
		current_mtime_after_read = file_path.stat().st_mtime
		self.ast_cache[file_path] = (
			current_mtime_after_read,
			root_node,
			parser,
		)  # Not caching content_bytes in AST cache for now
		logger.debug("Parsed and cached AST for: %s", file_path)
		return root_node, determined_language, parser, content_bytes_read  # Return read content_bytes
	except FileNotFoundError:
		logger.warning("File not found during parsing: %s", file_path)
		if file_path in self.ast_cache:
			del self.ast_cache[file_path]
		# If file not found, content_bytes_read would not have been assigned (or remains None)
		return None, determined_language, parser, None
	except Exception:
		logger.exception("Failed to parse file %s", file_path)
		# content_bytes_read will be None if open failed, or the read bytes if parse failed
		return None, determined_language, parser, content_bytes_read

get_syntax_handler

get_syntax_handler(
	language: str,
) -> LanguageSyntaxHandler | None

Get the syntax handler for a language.

Parameters:

Name Type Description Default
language str

The language to get a handler for

required

Returns:

Type Description
LanguageSyntaxHandler | None

A syntax handler or None if not supported

Source code in src/codemap/processor/tree_sitter/analyzer.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def get_syntax_handler(self, language: str) -> LanguageSyntaxHandler | None:
	"""
	Get the syntax handler for a language.

	Args:
	    language: The language to get a handler for

	Returns:
	    A syntax handler or None if not supported

	"""
	handler_class = LANGUAGE_HANDLERS.get(language)
	if not handler_class:
		return None
	return handler_class()

analyze_node

analyze_node(
	node: Node,
	content_bytes: bytes,
	file_path: Path,
	language: str,
	handler: LanguageSyntaxHandler,
	parent_node: Node | None = None,
) -> dict

Analyze a tree-sitter node and return structured information.

Parameters:

Name Type Description Default
node Node

The tree-sitter node

required
content_bytes bytes

Source code content as bytes

required
file_path Path

Path to the source file

required
language str

Programming language

required
handler LanguageSyntaxHandler

Language-specific syntax handler

required
parent_node Node | None

Parent node if any

None

Returns:

Type Description
dict

Dict with node analysis information

Source code in src/codemap/processor/tree_sitter/analyzer.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
def analyze_node(
	self,
	node: Node,
	content_bytes: bytes,
	file_path: Path,
	language: str,
	handler: LanguageSyntaxHandler,
	parent_node: Node | None = None,
) -> dict:
	"""
	Analyze a tree-sitter node and return structured information.

	Args:
	    node: The tree-sitter node
	    content_bytes: Source code content as bytes
	    file_path: Path to the source file
	    language: Programming language
	    handler: Language-specific syntax handler
	    parent_node: Parent node if any

	Returns:
	    Dict with node analysis information

	"""
	# Check if we should skip this node
	if handler.should_skip_node(node):
		return {}

	# Get entity type for this node from the handler
	entity_type = handler.get_entity_type(node, parent_node, content_bytes)

	# Skip unknown/uninteresting nodes unless they might contain interesting children
	if entity_type == EntityType.UNKNOWN and not node.named_child_count > 0:
		return {}

	# Get name and other metadata
	name = handler.extract_name(node, content_bytes)
	docstring_text, docstring_node = handler.find_docstring(node, content_bytes)

	# Get node content
	try:
		node_content = content_bytes[node.start_byte : node.end_byte].decode("utf-8", errors="ignore")
	except (UnicodeDecodeError, IndexError):
		node_content = ""

	# Extract dependencies from import statements
	dependencies = []
	if entity_type == EntityType.IMPORT:
		try:
			dependencies = handler.extract_imports(node, content_bytes)
		except (AttributeError, UnicodeDecodeError, IndexError, ValueError) as e:
			logger.debug("Failed to extract dependencies: %s", e)

	# Build result
	result = {
		"type": entity_type.name if entity_type != EntityType.UNKNOWN else "UNKNOWN",
		"name": name,
		"location": {
			"start_line": node.start_point[0] + 1,  # Convert to 1-based
			"end_line": node.end_point[0] + 1,
			"start_col": node.start_point[1],
			"end_col": node.end_point[1],
		},
		"docstring": docstring_text,
		"content": node_content,
		"children": [],
		"language": language,
	}

	# Add dependencies only if they exist to keep the output clean
	if dependencies:
		result["dependencies"] = dependencies

	# Extract function calls if the entity is a function or method
	calls = []
	if entity_type in (EntityType.FUNCTION, EntityType.METHOD):
		body_node = handler.get_body_node(node)
		if body_node:
			try:
				calls = handler.extract_calls(body_node, content_bytes)
			except (AttributeError, IndexError, UnicodeDecodeError, ValueError) as e:
				logger.debug("Failed to extract calls for %s: %s", name or "<anonymous>", e)

	# Add calls only if they exist
	if calls:
		result["calls"] = calls

	# Process child nodes
	body_node = handler.get_body_node(node)
	children_to_process = handler.get_children_to_process(node, body_node)

	for child in children_to_process:
		if docstring_node and child == docstring_node:
			continue  # Skip docstring node

		child_result = self.analyze_node(child, content_bytes, file_path, language, handler, node)

		if child_result:  # Only add non-empty results
			result["children"].append(child_result)

	return result

analyze_file

analyze_file(
	file_path: Path, language: str | None = None
) -> dict

Analyze a file and return its structural information.

Uses cached ASTs. The returned dictionary will include a 'full_content_str' key if the file content was successfully read.

Parameters:

Name Type Description Default
file_path Path

Path to the file

required
language str | None

Optional language override

None

Returns:

Type Description
dict

Structured analysis of the file or an empty dict on failure.

dict

Includes 'full_content_str' with the file's decoded content if read.

Source code in src/codemap/processor/tree_sitter/analyzer.py
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
def analyze_file(
	self,
	file_path: Path,
	language: str | None = None,
) -> dict:
	"""
	Analyze a file and return its structural information.

	Uses cached ASTs.
	The returned dictionary will include a 'full_content_str' key
	if the file content was successfully read.

	Args:
	    file_path: Path to the file
	    language: Optional language override

	Returns:
	    Structured analysis of the file or an empty dict on failure.
	    Includes 'full_content_str' with the file's decoded content if read.
	"""
	root_node, resolved_language, _, read_content_bytes = self.parse_file(file_path, language)

	if not root_node or not resolved_language:
		return {}

	handler = self.get_syntax_handler(resolved_language)
	if not handler:
		logger.debug("No syntax handler for language %s", resolved_language)
		return {}

	content_bytes_for_analysis = read_content_bytes
	decoded_full_content_str = None

	if content_bytes_for_analysis is None:
		try:
			with file_path.open("rb") as f:
				content_bytes_for_analysis = f.read()
		except Exception:
			logger.exception("Failed to re-read file content for analysis %s", file_path)
			return {}

	if content_bytes_for_analysis is not None:
		try:
			decoded_full_content_str = content_bytes_for_analysis.decode("utf-8", errors="ignore")
		except Exception:
			logger.exception("Failed to decode file content for %s", file_path)
			# Continue without decoded_full_content_str if decoding fails

	# Perform node-level analysis (recursive)
	analysis_data = self.analyze_node(root_node, content_bytes_for_analysis, file_path, resolved_language, handler)

	# Add the full decoded content to the top-level result if available
	if decoded_full_content_str is not None:
		analysis_data["full_content_str"] = decoded_full_content_str

	return analysis_data