Skip to content

Splitter

Diff splitting implementation for CodeMap.

logger module-attribute

logger = getLogger(__name__)

MAX_DIFF_CONTENT_LENGTH module-attribute

MAX_DIFF_CONTENT_LENGTH = 100000

MAX_DIFF_LINES module-attribute

MAX_DIFF_LINES = 1000

SMALL_SECTION_SIZE module-attribute

SMALL_SECTION_SIZE = 50

COMPLEX_SECTION_SIZE module-attribute

COMPLEX_SECTION_SIZE = 100

DiffSplitter

Splits Git diffs into logical chunks.

Source code in src/codemap/git/diff_splitter/splitter.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
class DiffSplitter:
	"""
	Splits Git diffs into logical chunks.

	Semantic splitting is attempted first. If it raises, the failure is logged
	and the splitter falls back to basic per-file splitting.
	"""

	def __init__(
		self,
		config_loader: "ConfigLoader | None" = None,
	) -> None:
		"""
		Initialize the diff splitter.

		Args:
		    config_loader: ConfigLoader object for loading configuration. When
		        None, the instance returned by ``ConfigLoader.get_instance()``
		        is used.
		"""
		# Compare against None explicitly so a loader object that happens to be
		# falsy is still honoured.
		if config_loader is not None:
			self.config_loader = config_loader
		else:
			# Imported locally - presumably to avoid an import cycle; confirm.
			from codemap.config import ConfigLoader

			self.config_loader = ConfigLoader.get_instance()

		# Prefer the configured repository root; otherwise ask the Git context.
		configured_root = self.config_loader.get.repo_root
		if configured_root is None:
			self.repo_root = ExtendedGitRepoContext.get_repo_root()
		else:
			self.repo_root = configured_root

		# All tunables are read straight from the commit.diff_splitter section
		# of the loaded configuration; no other source is consulted here.
		ds_config = self.config_loader.get.commit.diff_splitter
		self.similarity_threshold = ds_config.similarity_threshold
		self.directory_similarity_threshold = ds_config.directory_similarity_threshold
		self.min_chunks_for_consolidation = ds_config.min_chunks_for_consolidation
		self.max_chunks_before_consolidation = ds_config.max_chunks_before_consolidation
		self.max_file_size_for_llm = ds_config.max_file_size_for_llm
		self.max_log_diff_size = ds_config.max_log_diff_size

	@staticmethod
	def _find_file_section(text: str, file_marker: str) -> "tuple[int, int] | None":
		"""
		Locate one file's section inside a multi-file diff text.

		Args:
		    text: Diff text to search
		    file_marker: The ``diff --git a/<path> b/<path>`` header to look for

		Returns:
		    ``(start, end)`` offsets of the section, with trailing newlines
		    excluded, or None when no line of ``text`` starts with the marker

		"""
		# Only accept the marker at the start of a line, so the same text inside
		# a changed line (which carries a +/-/space prefix) is never taken for a
		# file header.
		if text.startswith(file_marker):
			start = 0
		else:
			newline_pos = text.find("\n" + file_marker)
			if newline_pos == -1:
				return None
			start = newline_pos + 1

		# The section runs up to the next file header, or to the end of the text.
		end = text.find("\ndiff --git ", start + len(file_marker))
		if end == -1:
			end = len(text)

		# Leave trailing newlines outside the span so that splicing a section
		# keeps whatever separator already follows it.
		while end > start and text[end - 1] == "\n":
			end -= 1
		return start, end

	@staticmethod
	def _restore_full_file_diffs(chunks: "list[DiffChunk]", original_content: str) -> None:
		"""
		Replace per-file sections in chunk contents with the full original sections.

		Each chunk is updated in place. A file is only touched when its header is
		found both in ``original_content`` and in the chunk's own content.

		Args:
		    chunks: Chunks whose ``content`` may hold shortened file sections
		    original_content: The complete diff text the chunks were derived from

		"""
		for chunk in chunks:
			for file_path in chunk.files:
				if not chunk.content:
					continue
				file_marker = f"diff --git a/{file_path} b/{file_path}"
				source_span = DiffSplitter._find_file_section(original_content, file_marker)
				if source_span is None:
					continue
				chunk_span = DiffSplitter._find_file_section(chunk.content, file_marker)
				if chunk_span is None:
					continue

				source_start, source_end = source_span
				chunk_start, chunk_end = chunk_span
				# Splice without stripping whitespace: whitespace-only context
				# lines stay intact and the newline before the next header in the
				# chunk is preserved.
				chunk.content = (
					chunk.content[:chunk_start]
					+ original_content[source_start:source_end]
					+ chunk.content[chunk_end:]
				)

	async def split_diff(self, diff: GitDiff) -> tuple[list[DiffChunk], list[str]]:
		"""
		Split a diff into logical chunks using semantic splitting.

		Note that ``diff.files`` is reassigned to the filtered list of valid
		files as a side effect.

		Args:
		    diff: GitDiff object to split

		Returns:
		    Tuple of (List of DiffChunk objects, List of filtered large files).
		    The second list is always empty; large files are no longer tracked here.

		Raises:
		    Exception: Failures of semantic splitting are caught and logged, and
		        basic per-file splitting is used instead. Only errors raised by
		        that fallback propagate.

		"""
		if not diff.files:
			return [], []

		# Special handling for untracked files - bypass semantic split since the content isn't a proper diff format
		if diff.is_untracked:
			logger.debug("Processing untracked files with special handling: %d files", len(diff.files))
			# One placeholder chunk per file; the content is never parsed as a diff.
			untracked_chunks = [
				DiffChunk(
					files=[file_path],
					content=f"New untracked file: {file_path}",
					description=f"New file: {file_path}",
				)
				for file_path in diff.files
			]
			return untracked_chunks, []

		in_test_env = is_test_environment()

		# In test environments, log the diff content for debugging
		if in_test_env:
			logger.debug("Processing diff in test environment with %d files", len(diff.files))
			if diff.content and len(diff.content) < self.max_log_diff_size:  # Use configured max log size
				logger.debug("Diff content: %s", diff.content)

		# Keep only valid files; the second return value is ignored here.
		logger.debug("DiffSplitter.split_diff: Files before filter_valid_files: %s", diff.files)
		diff.files, _ = filter_valid_files(diff.files, self.repo_root, in_test_env)
		logger.debug("DiffSplitter.split_diff: Files after filter_valid_files: %s", diff.files)

		if not diff.files:
			logger.warning("No valid files to process after filtering")
			return [], []

		try:
			semantic_strategy = SemanticSplitStrategy(config_loader=self.config_loader)
			chunks = await semantic_strategy.split(diff)

			# Chunk contents may hold shortened file sections; put the complete
			# original section for each file back in.
			if diff.content and chunks:
				self._restore_full_file_diffs(chunks, diff.content)

			return chunks, []
		except Exception:
			# Deliberate best-effort boundary: log with traceback, then degrade.
			logger.exception("Semantic splitting failed")
			logger.warning("Falling back to basic file splitting")
			return await self._create_basic_file_chunk(diff), []

	async def _create_basic_file_chunk(self, diff: GitDiff) -> list[DiffChunk]:
		"""
		Create a basic chunk per file without semantic analysis.

		Args:
		    diff: GitDiff object to split

		Returns:
		    List of DiffChunk objects produced by FileSplitStrategy, or an empty
		    list when the diff has no files

		"""
		if not diff.files:
			return []

		# No semantic grouping: delegate to the file-based strategy.
		strategy = FileSplitStrategy()
		return await strategy.split(diff)

__init__

__init__(config_loader: ConfigLoader | None = None) -> None

Initialize the diff splitter.

Parameters:

Name Type Description Default
config_loader ConfigLoader | None

ConfigLoader object for loading configuration

None
Source code in src/codemap/git/diff_splitter/splitter.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def __init__(
	self,
	config_loader: "ConfigLoader | None" = None,
) -> None:
	"""
	Initialize the diff splitter.

	Args:
	    config_loader: ConfigLoader object for loading configuration. When
	        None, the instance returned by ``ConfigLoader.get_instance()``
	        is used.
	"""
	# Compare against None explicitly so a loader object that happens to be
	# falsy is still honoured.
	if config_loader is not None:
		self.config_loader = config_loader
	else:
		# Imported locally - presumably to avoid an import cycle; confirm.
		from codemap.config import ConfigLoader

		self.config_loader = ConfigLoader.get_instance()

	# Prefer the configured repository root; otherwise ask the Git context.
	configured_root = self.config_loader.get.repo_root
	if configured_root is None:
		self.repo_root = ExtendedGitRepoContext.get_repo_root()
	else:
		self.repo_root = configured_root

	# All tunables are read straight from the commit.diff_splitter section of
	# the loaded configuration; no other source is consulted here.
	ds_config = self.config_loader.get.commit.diff_splitter
	self.similarity_threshold = ds_config.similarity_threshold
	self.directory_similarity_threshold = ds_config.directory_similarity_threshold
	self.min_chunks_for_consolidation = ds_config.min_chunks_for_consolidation
	self.max_chunks_before_consolidation = ds_config.max_chunks_before_consolidation
	self.max_file_size_for_llm = ds_config.max_file_size_for_llm
	self.max_log_diff_size = ds_config.max_log_diff_size

config_loader instance-attribute

config_loader = config_loader

repo_root instance-attribute

repo_root = get_repo_root()

similarity_threshold instance-attribute

similarity_threshold = similarity_threshold

directory_similarity_threshold instance-attribute

directory_similarity_threshold = (
	directory_similarity_threshold
)

min_chunks_for_consolidation instance-attribute

min_chunks_for_consolidation = min_chunks_for_consolidation

max_chunks_before_consolidation instance-attribute

max_chunks_before_consolidation = (
	max_chunks_before_consolidation
)

max_file_size_for_llm instance-attribute

max_file_size_for_llm = max_file_size_for_llm

max_log_diff_size instance-attribute

max_log_diff_size = max_log_diff_size

split_diff async

split_diff(
	diff: GitDiff,
) -> tuple[list[DiffChunk], list[str]]

Split a diff into logical chunks using semantic splitting.

Parameters:

Name Type Description Default
diff GitDiff

GitDiff object to split

required

Returns:

Type Description
tuple[list[DiffChunk], list[str]]

Tuple of (list of DiffChunk objects based on semantic analysis, list of filtered large files). In the implementation shown below, the second list is always empty.

Raises:

Type Description
ValueError

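Documented for the case where semantic splitting is not available or fails. In the implementation shown below, such failures are caught and logged, and the splitter falls back to basic per-file splitting instead of raising.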

Source code in src/codemap/git/diff_splitter/splitter.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
async def split_diff(self, diff: GitDiff) -> tuple[list[DiffChunk], list[str]]:
	"""
	Split a diff into logical chunks using semantic splitting.

	Note that ``diff.files`` is reassigned to the filtered list of valid files
	as a side effect.

	Args:
	    diff: GitDiff object to split

	Returns:
	    Tuple of (List of DiffChunk objects, List of filtered large files).
	    The second list is always empty; large files are no longer tracked here.

	Raises:
	    Exception: Failures of semantic splitting are caught and logged, and
	        basic per-file splitting is used instead. Only errors raised by
	        that fallback propagate.

	"""
	if not diff.files:
		return [], []

	# Special handling for untracked files - bypass semantic split since the content isn't a proper diff format
	if diff.is_untracked:
		logger.debug("Processing untracked files with special handling: %d files", len(diff.files))
		# One placeholder chunk per file; the content is never parsed as a diff.
		untracked_chunks = [
			DiffChunk(
				files=[file_path],
				content=f"New untracked file: {file_path}",
				description=f"New file: {file_path}",
			)
			for file_path in diff.files
		]
		return untracked_chunks, []

	def _section_bounds(text: str, file_marker: str) -> "tuple[int, int] | None":
		"""Return (start, end) of the file section headed by file_marker, or None."""
		# Only accept the marker at the start of a line, so the same text inside
		# a changed line (which carries a +/-/space prefix) is never taken for a
		# file header.
		if text.startswith(file_marker):
			start = 0
		else:
			newline_pos = text.find("\n" + file_marker)
			if newline_pos == -1:
				return None
			start = newline_pos + 1
		# The section runs up to the next file header, or to the end of the text.
		end = text.find("\ndiff --git ", start + len(file_marker))
		if end == -1:
			end = len(text)
		# Leave trailing newlines outside the span so that splicing a section
		# keeps whatever separator already follows it.
		while end > start and text[end - 1] == "\n":
			end -= 1
		return start, end

	in_test_env = is_test_environment()

	# In test environments, log the diff content for debugging
	if in_test_env:
		logger.debug("Processing diff in test environment with %d files", len(diff.files))
		if diff.content and len(diff.content) < self.max_log_diff_size:  # Use configured max log size
			logger.debug("Diff content: %s", diff.content)

	# Keep only valid files; the second return value is ignored here.
	logger.debug("DiffSplitter.split_diff: Files before filter_valid_files: %s", diff.files)
	diff.files, _ = filter_valid_files(diff.files, self.repo_root, in_test_env)
	logger.debug("DiffSplitter.split_diff: Files after filter_valid_files: %s", diff.files)

	if not diff.files:
		logger.warning("No valid files to process after filtering")
		return [], []

	try:
		semantic_strategy = SemanticSplitStrategy(config_loader=self.config_loader)
		chunks = await semantic_strategy.split(diff)

		# Chunk contents may hold shortened file sections; put the complete
		# original section for each file back in.
		if diff.content and chunks:
			for chunk in chunks:
				for file_path in chunk.files:
					if not chunk.content:
						continue
					file_marker = f"diff --git a/{file_path} b/{file_path}"
					source_span = _section_bounds(diff.content, file_marker)
					if source_span is None:
						continue
					chunk_span = _section_bounds(chunk.content, file_marker)
					if chunk_span is None:
						continue

					source_start, source_end = source_span
					chunk_start, chunk_end = chunk_span
					# Splice without stripping whitespace: whitespace-only
					# context lines stay intact and the newline before the next
					# header in the chunk is preserved.
					chunk.content = (
						chunk.content[:chunk_start]
						+ diff.content[source_start:source_end]
						+ chunk.content[chunk_end:]
					)

		return chunks, []
	except Exception:
		# Deliberate best-effort boundary: log with traceback, then degrade.
		logger.exception("Semantic splitting failed")
		logger.warning("Falling back to basic file splitting")
		return await self._create_basic_file_chunk(diff), []