# ---------------------------------------------------------# Copyright (c) Microsoft Corporation. All rights reserved.# Licensed under the MIT License.# ---------------------------------------------------------"""Uses the REST api to perform operations on the search service.see: https://docs.microsoft.com/en-us/rest/api/searchservice/"""importjsonimportosimportpkgutilimportsysimporttimefromitertoolsimportcycleimportrequestsfrom.commonimportDEFAULT_PROJECTIONS_CONTAINER_NAMEAPI_VERSION="?api-version=2019-05-06-Preview"# 15 min scheduleSCHEDULE_INTERVAL="PT15M"
class GrokRestClient:
    """REST client wrapping the Azure Search Service REST api.

    see: https://docs.microsoft.com/en-us/rest/api/searchservice/

    This class can be used to create an indexing pipeline and can be used to
    run and monitor ongoing indexers. The indexing pipeline can allow you to
    run batch OCR enrichment of documents.
    """

    # Placeholder substituted for credentials before a payload is printed.
    _REDACTED = "<redacted>"

    def __init__(
        self,
        cognitive_service_key,
        search_service_key,
        search_service_name,
        skillset_name,
        index_name,
        indexer_name,
        datasource_name,
        datasource_container_name,
        blob_account_name,
        blob_key,
        projections_container_name=DEFAULT_PROJECTIONS_CONTAINER_NAME,
    ):
        """Creates the REST client

        Args:
            cognitive_service_key (str): key to cognitive services account
            search_service_key (str): key to the search service account
            search_service_name (str): name of the search service account
            skillset_name (str): name of the skillset
            index_name (str): name of the index
            indexer_name (str): the name of indexer
            datasource_name (str): the name to give the attached blob storage source
            datasource_container_name (str): the container in the blob storage
                that hosts the files
            blob_account_name (str): blob storage account name that will host
                the documents to push through the pipeline
            blob_key (str): key to blob storage account
            projections_container_name (str): container that the knowledge
                store projections are written to

        Raises:
            ValueError: if any argument is empty or None
        """
        # Validate in a fixed order so the first unset argument is the one
        # named in the error message.
        required = (
            ("COGNITIVE_SERVICE_KEY", cognitive_service_key),
            ("SEARCH_SERVICE_KEY", search_service_key),
            ("SEARCH_SERVICE_NAME", search_service_name),
            ("SKILLSET_NAME", skillset_name),
            ("INDEX_NAME", index_name),
            ("INDEXER_NAME", indexer_name),
            ("DATASOURCE_NAME", datasource_name),
            ("DATASOURCE_CONTAINER_NAME", datasource_container_name),
            ("PROJECTIONS_CONTAINER_NAME", projections_container_name),
            ("BLOB_NAME", blob_account_name),
            ("BLOB_KEY", blob_key),
        )
        for name, value in required:
            self._checkArg(name, value)

        self.COGNITIVE_SERVICE_KEY = cognitive_service_key
        self.SEARCH_SERVICE_KEY = search_service_key
        self.SEARCH_SERVICE_NAME = search_service_name
        self.SKILLSET_NAME = skillset_name
        self.INDEX_NAME = index_name
        self.INDEXER_NAME = indexer_name
        self.DATASOURCE_NAME = datasource_name
        self.DATASOURCE_CONTAINER_NAME = datasource_container_name
        self.PROJECTIONS_CONTAINER_NAME = projections_container_name
        self.BLOB_NAME = blob_account_name
        self.BLOB_KEY = blob_key
        self.API_VERSION = API_VERSION

        self.BLOB_CONNECTION_STRING = (
            f"DefaultEndpointsProtocol=https;AccountName={self.BLOB_NAME};"
            f"AccountKey={self.BLOB_KEY};EndpointSuffix=core.windows.net"
        )

    @staticmethod
    def create_from_env_var():
        """Builds a client from environment variables.

        Reads COGNITIVE_SERVICE_KEY, SEARCH_SERVICE_KEY, SEARCH_SERVICE_NAME,
        SKILLSET_NAME, INDEX_NAME, INDEXER_NAME, DATASOURCE_NAME,
        DATASOURCE_CONTAINER_NAME, BLOB_NAME and BLOB_KEY, all of which are
        required. PROJECTIONS_CONTAINER_NAME is optional and falls back to
        DEFAULT_PROJECTIONS_CONTAINER_NAME.

        Returns:
            GrokRestClient: the configured client

        Raises:
            KeyError: if a required environment variable is missing
            ValueError: if a variable is present but empty
        """
        COGNITIVE_SERVICE_KEY = os.environ["COGNITIVE_SERVICE_KEY"]
        SEARCH_SERVICE_KEY = os.environ["SEARCH_SERVICE_KEY"]
        SEARCH_SERVICE_NAME = os.environ["SEARCH_SERVICE_NAME"]
        SKILLSET_NAME = os.environ["SKILLSET_NAME"]
        INDEX_NAME = os.environ["INDEX_NAME"]
        INDEXER_NAME = os.environ["INDEXER_NAME"]
        DATASOURCE_NAME = os.environ["DATASOURCE_NAME"]
        DATASOURCE_CONTAINER_NAME = os.environ["DATASOURCE_CONTAINER_NAME"]
        BLOB_NAME = os.environ["BLOB_NAME"]
        BLOB_KEY = os.environ["BLOB_KEY"]
        PROJECTIONS_CONTAINER_NAME = os.environ.get(
            "PROJECTIONS_CONTAINER_NAME", DEFAULT_PROJECTIONS_CONTAINER_NAME
        )

        client = GrokRestClient(
            COGNITIVE_SERVICE_KEY,
            SEARCH_SERVICE_KEY,
            SEARCH_SERVICE_NAME,
            SKILLSET_NAME,
            INDEX_NAME,
            INDEXER_NAME,
            DATASOURCE_NAME,
            DATASOURCE_CONTAINER_NAME,
            BLOB_NAME,
            BLOB_KEY,
            projections_container_name=PROJECTIONS_CONTAINER_NAME,
        )
        return client

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _headers(self):
        """Returns the headers sent with every request to the search service."""
        return {
            "Content-Type": "application/json",
            "api-key": self.SEARCH_SERVICE_KEY,
        }

    def _url(self, collection, name, action=None):
        """Builds the versioned URL of a resource, optionally with a sub-action.

        Args:
            collection (str): resource collection, e.g. "indexers"
            name (str): name of the resource inside the collection
            action (str, optional): trailing path segment such as "run"
        """
        url = f"https://{self.SEARCH_SERVICE_NAME}.search.windows.net/{collection}/{name}"
        if action:
            url = f"{url}/{action}"
        return url + self.API_VERSION

    @staticmethod
    def _load_template(filename):
        """Loads and parses a json template from the package's templates folder."""
        return json.loads(pkgutil.get_data(__name__, f"templates/{filename}"))

    def _put_json(self, url, payload):
        """PUTs ``payload`` serialized as json to ``url`` and returns the response."""
        return requests.put(url, json.dumps(payload), headers=self._headers())

    # ------------------------------------------------------------------
    # pipeline creation
    # ------------------------------------------------------------------
    def create_skillset(self):
        """Adds a skillset that performs OCR on images

        Raises:
            requests.HTTPError: if the service rejects the request
        """
        skillset_json = self._load_template("skillset.json")
        skillset_json["name"] = self.SKILLSET_NAME
        skillset_json["cognitiveServices"]["key"] = self.COGNITIVE_SERVICE_KEY

        knowledge_store_json = self._load_template("knowledge_store.json")
        knowledge_store_json["storageConnectionString"] = self.BLOB_CONNECTION_STRING
        knowledge_store_json["projections"][0]["objects"][0][
            "storageContainer"
        ] = self.PROJECTIONS_CONTAINER_NAME
        skillset_json["knowledgeStore"] = knowledge_store_json

        # The payload now carries the cognitive services key and the storage
        # account key, so only a redacted copy is written to stdout.
        printable = dict(skillset_json)
        printable["cognitiveServices"] = {
            **skillset_json["cognitiveServices"],
            "key": self._REDACTED,
        }
        printable["knowledgeStore"] = {
            **knowledge_store_json,
            "storageConnectionString": self._REDACTED,
        }
        print(printable)

        r = self._put_json(self._url("skillsets", self.SKILLSET_NAME), skillset_json)
        print("skillset response", r.text)
        r.raise_for_status()
        print("added skillset", self.SKILLSET_NAME, r)

    def create_datasource(self):
        """Attaches the blob data store to the search service as a source for image documents

        Raises:
            requests.HTTPError: if the service rejects the request
        """
        datasource_json = self._load_template("datasource.json")
        datasource_json["name"] = self.DATASOURCE_NAME
        datasource_json["credentials"]["connectionString"] = self.BLOB_CONNECTION_STRING
        datasource_json["type"] = "azureblob"
        datasource_json["container"]["name"] = self.DATASOURCE_CONTAINER_NAME

        r = self._put_json(
            self._url("datasources", self.DATASOURCE_NAME), datasource_json
        )
        print("datasource response", r.text)
        r.raise_for_status()
        print("added datasource", self.DATASOURCE_NAME, r)

    def create_index(self):
        """Create an index with the layoutText column to store OCR output from the enrichment

        Raises:
            requests.HTTPError: if the service rejects the request
        """
        index_json = self._load_template("index.json")
        index_json["name"] = self.INDEX_NAME

        r = self._put_json(self._url("indexes", self.INDEX_NAME), index_json)
        print("index response", r.text)
        r.raise_for_status()
        print("created index", self.INDEX_NAME, r)

    def create_indexer(self, extension_to_exclude=".txt, .json"):
        """Creates an indexer that runs the enrichment skillset of documents
        from the datasource. The enriched results are pushed to the index.

        Args:
            extension_to_exclude (str): value stored in the indexer's
                ``excludedFileNameExtensions`` configuration entry

        Raises:
            requests.HTTPError: if the service rejects the request
        """
        indexer_json = self._load_template("indexer.json")
        indexer_json["name"] = self.INDEXER_NAME
        indexer_json["skillsetName"] = self.SKILLSET_NAME
        indexer_json["targetIndexName"] = self.INDEX_NAME
        indexer_json["dataSourceName"] = self.DATASOURCE_NAME
        indexer_json["schedule"] = {"interval": SCHEDULE_INTERVAL}
        indexer_json["parameters"]["configuration"][
            "excludedFileNameExtensions"
        ] = extension_to_exclude

        r = self._put_json(self._url("indexers", self.INDEXER_NAME), indexer_json)
        print("indexer response", r.text)
        r.raise_for_status()
        print("created indexer", self.INDEXER_NAME, r)

    def create_indexing_pipeline(self):
        """Using REST calls, creates a skillset, index, datasource and indexer
        (in that order) on the search service. The templates for json are in
        the templates folder.
        """
        self.create_skillset()
        self.create_index()
        self.create_datasource()
        self.create_indexer()

    def delete_indexer_pipeline(self):
        """Deletes the indexer, index, datasource and skillset that had been
        previously created.

        Every resource is attempted even when an earlier deletion fails, so a
        partially created pipeline can still be cleaned up.

        Raises:
            requests.HTTPError: the first HTTP error encountered, raised only
                after all four deletions have been attempted
        """
        endpoints = [
            self._url("indexers", self.INDEXER_NAME),
            self._url("indexes", self.INDEX_NAME),
            self._url("datasources", self.DATASOURCE_NAME),
            self._url("skillsets", self.SKILLSET_NAME),
        ]
        headers = self._headers()

        first_error = None
        for endpoint in endpoints:
            r = requests.delete(endpoint, headers=headers)
            print("delete response", r.text)
            try:
                r.raise_for_status()
            except requests.HTTPError as err:
                # Remember the failure but keep going with the other resources.
                if first_error is None:
                    first_error = err

        if first_error is not None:
            raise first_error

    # ------------------------------------------------------------------
    # running and monitoring
    # ------------------------------------------------------------------
    def run_indexer(self):
        """Requests an on-demand run of the indexer.

        Raises:
            requests.HTTPError: if the service rejects the request
        """
        r = requests.post(
            self._url("indexers", self.INDEXER_NAME, "run"), headers=self._headers()
        )
        print("run indexer response", r.text)
        r.raise_for_status()
        print("running indexer", self.INDEXER_NAME, r)

    def poll_indexer_till_complete(self, timeout=None):
        """Blocks until the indexer's last run is no longer in progress.

        The status endpoint is queried once immediately and then once every
        100 spinner ticks of 0.05 seconds each; a console spinner is drawn
        on stdout in between.

        Args:
            timeout (float, optional): maximum number of seconds to wait.
                ``None`` (the default) waits indefinitely.

        Returns:
            dict: the decoded status response of the final poll

        Raises:
            RuntimeError: if the indexer's overall status is "error"
            TimeoutError: if ``timeout`` elapses before the run finishes
            requests.HTTPError: if a status request fails
        """
        deadline = None if timeout is None else time.monotonic() + timeout
        spinner = cycle("|\b/\b-\b\\\b")
        tick = 0
        while True:
            # attempt a call every 100 steps
            if tick == 0:
                status_json = self.get_indexer_status()
                if status_json["status"] == "error":
                    raise RuntimeError("Indexer failed")
                last_result = status_json["lastResult"]
                # lastResult is falsy until a run has been recorded.
                if last_result and last_result["status"] != "inProgress":
                    print(last_result["status"], self.INDEXER_NAME)
                    return status_json
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"indexer {self.INDEXER_NAME} did not complete within {timeout} seconds"
                )
            sys.stdout.write(next(spinner))
            sys.stdout.flush()
            time.sleep(0.05)
            tick = (tick + 1) % 100

    def get_indexer_status(self):
        """Fetches the current status of the indexer.

        Returns:
            dict: the decoded json body of the status response

        Raises:
            requests.HTTPError: if the service rejects the request
        """
        response = requests.get(
            self._url("indexers", self.INDEXER_NAME, "status"), headers=self._headers()
        )
        response.raise_for_status()
        return response.json()

    def _checkArg(self, name, value):
        """Raises ValueError naming ``name`` when ``value`` is falsy."""
        if not value:
            raise ValueError(f"argument {name} is not set")