Skip to content

Embedder

Module for generating embeddings from diff chunks.

logger module-attribute

logger = getLogger(__name__)

DiffEmbedder

Generates embeddings for diff chunks.

Source code in src/codemap/git/semantic_grouping/embedder.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
class DiffEmbedder:
	"""Generates embeddings for diff chunks.

	Diff text is stripped of metadata lines before being passed to the
	project's ``generate_embedding`` helper; zero vectors are returned as a
	fallback when embedding generation fails.
	"""

	# Dimension of the zero-vector fallback used when embedding fails.
	# NOTE(review): assumes the configured model emits 1024-dim vectors — confirm.
	_FALLBACK_DIM = 1024

	def __init__(
		self,
		config_loader: "ConfigLoader",
	) -> None:
		"""
		Initialize the embedder with configuration.

		Args:
		    config_loader: ConfigLoader instance for embedding configuration.
		"""
		self.config_loader = config_loader

	def preprocess_diff(self, diff_text: str) -> str:
		"""
		Preprocess diff text to make it more suitable for embedding.

		Drops diff metadata lines (``diff --git``, ``index``, ``+++``, ``---``)
		and strips the one-character +/-/space prefix from content lines so the
		embedding model sees only the changed text.

		Args:
		    diff_text: Raw diff text

		Returns:
		    Preprocessed text

		"""
		lines = []
		for line in diff_text.splitlines():
			# Skip diff metadata lines. Checked before the +/- test so the
			# "+++" / "---" file headers are not mistaken for content changes.
			if line.startswith(("diff --git", "index ", "+++", "---")):
				continue

			# Keep actual content changes, removing the +/-/space prefix
			if line.startswith(("+", "-", " ")):
				lines.append(line[1:])

		return "\n".join(lines)

	async def embed_chunk(self, chunk: DiffChunk) -> np.ndarray:
		"""
		Generate an embedding for a diff chunk using Voyage AI.

		Args:
		    chunk: DiffChunk object

		Returns:
		    numpy.ndarray: Embedding vector; a zero vector on failure.

		"""
		# Preprocess the diff content from the chunk
		processed_text = self.preprocess_diff(chunk.content)

		# If the processed text is empty, use the file paths as context
		if not processed_text.strip():
			processed_text = " ".join(chunk.files)

		# Generate embeddings in batch (of 1)
		embeddings = generate_embedding([processed_text], self.config_loader)

		if not embeddings:
			# Lazy %-args: the join only runs if the record is actually emitted.
			logger.error("Failed to generate embedding for chunk with files: %s", ", ".join(chunk.files))
			# Return a zero vector as a fallback
			return np.zeros(self._FALLBACK_DIM)

		return np.array(embeddings[0])

	async def embed_contents(self, contents: list[str]) -> list[list[float] | None]:
		"""
		Generate embeddings for multiple content strings.

		Args:
		    contents: List of text content strings to embed

		Returns:
		    List aligned with ``contents``: an embedding vector for each entry
		    that could be embedded, ``None`` for empty or unembeddable entries.
		"""
		# Filter out empty contents, remembering the original positions of the
		# entries we actually send to the embedder.
		contents_to_embed: list[str] = []
		valid_indices: list[int] = []

		for i, content in enumerate(contents):
			if content and content.strip():
				# Preprocess if it looks like diff content
				if content.startswith(("diff --git", "+", "-", " ")):
					processed = self.preprocess_diff(content)
					if processed.strip():
						contents_to_embed.append(processed)
						valid_indices.append(i)
				else:
					# Use as-is if it doesn't look like a diff
					contents_to_embed.append(content)
					valid_indices.append(i)

		# Return early if no valid contents
		if not contents_to_embed:
			return [None] * len(contents)

		# Generate embeddings in batch
		try:
			embeddings_batch = generate_embedding(contents_to_embed, self.config_loader)

			# Rebuild result list with None for invalid contents
			result: list[list[float] | None] = [None] * len(contents)
			if embeddings_batch:
				for idx, valid_idx in enumerate(valid_indices):
					if idx < len(embeddings_batch):
						result[valid_idx] = embeddings_batch[idx]
			return result

		except Exception:
			# Best-effort API: log the failure and degrade to all-None rather
			# than propagating to callers.
			logger.exception("Unexpected error during embedding generation")
			return [None] * len(contents)

	async def embed_chunks(self, chunks: list[DiffChunk]) -> list[tuple[DiffChunk, np.ndarray]]:
		"""
		Generate embeddings for multiple chunks using efficient batch processing.

		Args:
		    chunks: List of DiffChunk objects

		Returns:
		    List of (chunk, embedding) tuples; zero vectors are substituted for
		    chunks whose embedding could not be generated.

		"""
		if not chunks:
			return []

		# Preprocess all chunk texts
		preprocessed_texts = []
		for chunk in chunks:
			processed_text = self.preprocess_diff(chunk.content)

			# If the processed text is empty, use the file paths as context
			if not processed_text.strip():
				processed_text = " ".join(chunk.files)

			preprocessed_texts.append(processed_text)

		# Generate embeddings in batch
		embeddings = generate_embedding(preprocessed_texts, self.config_loader)

		# Create result tuples
		result = []
		if embeddings:
			for i, chunk in enumerate(chunks):
				if i < len(embeddings):
					embedding = np.array(embeddings[i])
				else:
					# Lazy %-args instead of an eagerly-built f-string.
					logger.error("Missing embedding for chunk with files: %s", ", ".join(chunk.files))
					embedding = np.zeros(self._FALLBACK_DIM)  # Fallback
				result.append((chunk, embedding))
		else:
			# Fallback if batch embedding failed
			logger.error("Batch embedding generation failed, using fallback zeros")
			result.extend((chunk, np.zeros(self._FALLBACK_DIM)) for chunk in chunks)

		return result

__init__

__init__(config_loader: ConfigLoader) -> None

Initialize the embedder with configuration.

Parameters:

Name Type Description Default
config_loader ConfigLoader

ConfigLoader instance for embedding configuration.

required
Source code in src/codemap/git/semantic_grouping/embedder.py
20
21
22
23
24
25
26
27
28
29
30
def __init__(self, config_loader: "ConfigLoader") -> None:
	"""Initialize the embedder with configuration.

	Args:
	    config_loader: ConfigLoader instance for embedding configuration.
	"""
	self.config_loader = config_loader

config_loader instance-attribute

config_loader = config_loader

preprocess_diff

preprocess_diff(diff_text: str) -> str

Preprocess diff text to make it more suitable for embedding.

Parameters:

Name Type Description Default
diff_text str

Raw diff text

required

Returns:

Type Description
str

Preprocessed text

Source code in src/codemap/git/semantic_grouping/embedder.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def preprocess_diff(self, diff_text: str) -> str:
	"""
	Preprocess diff text to make it more suitable for embedding.

	Args:
	    diff_text: Raw diff text

	Returns:
	    Preprocessed text

	"""
	# Metadata headers carry no semantic content; content lines keep their
	# text with the leading +/-/space marker removed.
	metadata_prefixes = ("diff --git", "index ", "+++", "---")
	change_markers = ("+", "-", " ")

	kept = [
		raw_line[1:]
		for raw_line in diff_text.splitlines()
		if not raw_line.startswith(metadata_prefixes) and raw_line.startswith(change_markers)
	]
	return "\n".join(kept)

embed_chunk async

embed_chunk(chunk: DiffChunk) -> ndarray

Generate an embedding for a diff chunk using Voyage AI.

Parameters:

Name Type Description Default
chunk DiffChunk

DiffChunk object

required

Returns:

Type Description
ndarray

numpy.ndarray: Embedding vector

Source code in src/codemap/git/semantic_grouping/embedder.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
async def embed_chunk(self, chunk: DiffChunk) -> np.ndarray:
	"""
	Generate an embedding for a diff chunk using Voyage AI.

	Args:
	    chunk: DiffChunk object

	Returns:
	    numpy.ndarray: Embedding vector

	"""
	# Clean up the raw diff before embedding it.
	text = self.preprocess_diff(chunk.content)

	# Fall back to the file paths when preprocessing leaves nothing behind.
	text = text if text.strip() else " ".join(chunk.files)

	# Single-item batch call to the embedding backend.
	vectors = generate_embedding([text], self.config_loader)

	if vectors:
		return np.array(vectors[0])

	# Embedding failed: log and fall back to a zero vector.
	logger.error(f"Failed to generate embedding for chunk with files: {', '.join(chunk.files)}")
	return np.zeros(1024)  # Using default dimension of 1024

embed_contents async

embed_contents(contents: list[str]) -> list[float | None]

Generate embeddings for multiple content strings.

Parameters:

Name Type Description Default
contents list[str]

List of text content strings to embed

required

Returns:

Type Description
list[float | None]

List of embedding vectors or None for each content

Source code in src/codemap/git/semantic_grouping/embedder.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
async def embed_contents(self, contents: list[str]) -> list[float | None]:
	"""
	Generate embeddings for multiple content strings.

	Args:
	    contents: List of text content strings to embed

	Returns:
	    List of embedding vectors or None for each content
	"""
	# Filter out empty contents
	contents_to_embed = []
	valid_indices = []

	for i, content in enumerate(contents):
		if content and content.strip():
			# Preprocess if it looks like diff content
			if content.startswith(("diff --git", "+", "-", " ")):
				processed = self.preprocess_diff(content)
				if processed.strip():
					contents_to_embed.append(processed)
					valid_indices.append(i)
			else:
				# Use as-is if it doesn't look like a diff
				contents_to_embed.append(content)
				valid_indices.append(i)

	# Return early if no valid contents
	if not contents_to_embed:
		return [None] * len(contents)

	# Generate embeddings in batch
	try:
		embeddings_batch = generate_embedding(contents_to_embed, self.config_loader)

		# Rebuild result list with None for invalid contents
		result: list[float | None] = [None] * len(contents)
		if embeddings_batch:
			for idx, valid_idx in enumerate(valid_indices):
				if idx < len(embeddings_batch):
					result[valid_idx] = embeddings_batch[idx]
		return result

	except Exception:
		logger.exception("Unexpected error during embedding generation")
		return [None] * len(contents)

embed_chunks async

embed_chunks(
	chunks: list[DiffChunk],
) -> list[tuple[DiffChunk, ndarray]]

Generate embeddings for multiple chunks using efficient batch processing.

Parameters:

Name Type Description Default
chunks list[DiffChunk]

List of DiffChunk objects

required

Returns:

Type Description
list[tuple[DiffChunk, ndarray]]

List of (chunk, embedding) tuples

Source code in src/codemap/git/semantic_grouping/embedder.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
async def embed_chunks(self, chunks: list[DiffChunk]) -> list[tuple[DiffChunk, np.ndarray]]:
	"""
	Generate embeddings for multiple chunks using efficient batch processing.

	Args:
	    chunks: List of DiffChunk objects

	Returns:
	    List of (chunk, embedding) tuples; zero vectors are substituted for
	    chunks whose embedding could not be generated.

	"""
	if not chunks:
		return []

	# Preprocess all chunk texts
	preprocessed_texts = []
	for chunk in chunks:
		processed_text = self.preprocess_diff(chunk.content)

		# If the processed text is empty, use the file paths as context
		if not processed_text.strip():
			processed_text = " ".join(chunk.files)

		preprocessed_texts.append(processed_text)

	# Generate embeddings in batch
	embeddings = generate_embedding(preprocessed_texts, self.config_loader)

	# Create result tuples
	result = []
	if embeddings:
		for i, chunk in enumerate(chunks):
			if i < len(embeddings):
				embedding = np.array(embeddings[i])
			else:
				# Lazy %-args instead of an eagerly-built f-string.
				logger.error("Missing embedding for chunk with files: %s", ", ".join(chunk.files))
				embedding = np.zeros(1024)  # Fallback
			result.append((chunk, embedding))
	else:
		# Fallback if batch embedding failed
		logger.error("Batch embedding generation failed, using fallback zeros")
		result.extend((chunk, np.zeros(1024)) for chunk in chunks)

	return result