azure_search_skillset.py
import logging

from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset,
)
from azure.search.documents.indexes import SearchIndexerClient
from ..helpers.config.config_helper import IntegratedVectorizationConfig
from ..helpers.env_helper import EnvHelper
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential

logger = logging.getLogger(__name__)


class AzureSearchSkillset:
    def __init__(
        self,
        env_helper: EnvHelper,
        integrated_vectorization_config: IntegratedVectorizationConfig,
    ):
        self.env_helper = env_helper
        # Authenticate with an API key when key-based auth is configured,
        # otherwise fall back to DefaultAzureCredential (managed identity, CLI, etc.).
        self.indexer_client = SearchIndexerClient(
            self.env_helper.AZURE_SEARCH_SERVICE,
            (
                AzureKeyCredential(self.env_helper.AZURE_SEARCH_KEY)
                if self.env_helper.is_auth_type_keys()
                else DefaultAzureCredential()
            ),
        )
        self.integrated_vectorization_config = integrated_vectorization_config

    def create_skillset(self):
        skillset_name = f"{self.env_helper.AZURE_SEARCH_INDEX}-skillset"

        # Chunk each document into overlapping pages.
        split_skill = SplitSkill(
            description="Split skill to chunk documents",
            text_split_mode="pages",
            context="/document",
            maximum_page_length=self.integrated_vectorization_config.max_page_length,
            page_overlap_length=self.integrated_vectorization_config.page_overlap_length,
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/content"),
            ],
            outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
        )

        # Generate an embedding vector for every chunk via Azure OpenAI.
        embedding_skill = AzureOpenAIEmbeddingSkill(
            description="Skill to generate embeddings via Azure OpenAI",
            context="/document/pages/*",
            resource_uri=self.env_helper.AZURE_OPENAI_ENDPOINT,
            deployment_id=self.env_helper.AZURE_OPENAI_EMBEDDING_MODEL,
            api_key=(
                self.env_helper.OPENAI_API_KEY
                if self.env_helper.is_auth_type_keys()
                else None
            ),
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/pages/*"),
            ],
            outputs=[
                OutputFieldMappingEntry(name="embedding", target_name="content_vector")
            ],
        )

        # Project each chunk into the target index as its own document,
        # skipping indexing of the parent documents themselves.
        index_projections = SearchIndexerIndexProjections(
            selectors=[
                SearchIndexerIndexProjectionSelector(
                    target_index_name=self.env_helper.AZURE_SEARCH_INDEX,
                    parent_key_field_name="id",
                    source_context="/document/pages/*",
                    mappings=[
                        InputFieldMappingEntry(
                            name="content", source="/document/pages/*"
                        ),
                        InputFieldMappingEntry(
                            name="content_vector",
                            source="/document/pages/*/content_vector",
                        ),
                        InputFieldMappingEntry(name="title", source="/document/title"),
                        InputFieldMappingEntry(
                            name="source", source="/document/metadata_storage_path"
                        ),
                    ],
                ),
            ],
            parameters=SearchIndexerIndexProjectionsParameters(
                projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
            ),
        )

        skillset = SearchIndexerSkillset(
            name=skillset_name,
            description="Skillset to chunk documents and generate embeddings",
            skills=[split_skill, embedding_skill],
            index_projections=index_projections,
        )

        skillset_result = self.indexer_client.create_or_update_skillset(skillset)
        logger.info(f"{skillset.name} created")
        return skillset_result
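
A minimal usage sketch (not part of this module): it assumes EnvHelper() can pick up the required AZURE_SEARCH_* / AZURE_OPENAI_* settings from the environment, and it substitutes a simple stand-in object for IntegratedVectorizationConfig, since create_skillset() only reads the max_page_length and page_overlap_length attributes.

from types import SimpleNamespace

# Stand-in for IntegratedVectorizationConfig (assumed values; only the two
# attributes read by create_skillset() are provided here).
iv_config = SimpleNamespace(max_page_length=800, page_overlap_length=100)

env_helper = EnvHelper()  # assumed to resolve search/OpenAI settings from the environment
skillset = AzureSearchSkillset(env_helper, iv_config).create_skillset()
print(skillset.name)  # e.g. "<AZURE_SEARCH_INDEX>-skillset"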