Skip to main content
Version: 0.10.1

Text Analytics

EntityDetector

from synapse.ml.cognitive import *
import os

# Resolve the key lazily: dict.get evaluates its default eagerly, so the original
# called getSecret (a notebook-provided helper) even when the env var was set.
textKey = os.environ.get("COGNITIVE_API_KEY")
if textKey is None:
    textKey = getSecret("cognitive-api-key")

# Sample documents to run entity linking on.
df = spark.createDataFrame([
    ("1", "Microsoft released Windows 10"),
    ("2", "In 1975, Bill Gates III and Paul Allen founded the company.")
], ["id", "text"])

entity = (EntityDetector()
          .setSubscriptionKey(textKey)
          .setLocation("eastus")
          .setLanguage("en")
          .setOutputCol("replies")
          .setErrorCol("error"))

entity.transform(df).show()
Python API: EntityDetector | Scala API: EntityDetector | .NET API: EntityDetector | Source: EntityDetector

KeyPhraseExtractor

from synapse.ml.cognitive import *
import os

# Lazy fallback: only call getSecret when the env var is absent
# (dict.get would evaluate the default eagerly).
textKey = os.environ.get("COGNITIVE_API_KEY")
if textKey is None:
    textKey = getSecret("cognitive-api-key")

# Per-row language codes drive the service via setLanguageCol below.
df = spark.createDataFrame([
    ("en", "Hello world. This is some input text that I love."),
    ("fr", "Bonjour tout le monde"),
    ("es", "La carretera estaba atascada. Había mucho tráfico el día de ayer.")
], ["lang", "text"])

keyPhrase = (KeyPhraseExtractor()
             .setSubscriptionKey(textKey)
             .setLocation("eastus")
             .setLanguageCol("lang")
             .setOutputCol("replies")
             .setErrorCol("error"))

keyPhrase.transform(df).show()
Python API: KeyPhraseExtractor | Scala API: KeyPhraseExtractor | .NET API: KeyPhraseExtractor | Source: KeyPhraseExtractor

LanguageDetector

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
textKey = os.environ.get("COGNITIVE_API_KEY")
if textKey is None:
    textKey = getSecret("cognitive-api-key")

# Texts in several languages (plus emoticons) for language detection.
df = spark.createDataFrame([
    ("Hello World",),
    ("Bonjour tout le monde",),
    ("La carretera estaba atascada. Había mucho tráfico el día de ayer.",),
    ("你好",),
    ("こんにちは",),
    (":) :( :D",)
], ["text",])

language = (LanguageDetector()
            .setSubscriptionKey(textKey)
            .setLocation("eastus")
            .setTextCol("text")
            .setOutputCol("language")
            .setErrorCol("error"))

language.transform(df).show()
Python API: LanguageDetector | Scala API: LanguageDetector | .NET API: LanguageDetector | Source: LanguageDetector

NER

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
textKey = os.environ.get("COGNITIVE_API_KEY")
if textKey is None:
    textKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("1", "en", "I had a wonderful trip to Seattle last week."),
    ("2", "en", "I visited Space Needle 2 times.")
], ["id", "language", "text"])

ner = (NER()
       .setSubscriptionKey(textKey)
       .setLocation("eastus")
       .setLanguageCol("language")
       .setOutputCol("replies")
       .setErrorCol("error"))

ner.transform(df).show()
Python API: NER | Scala API: NER | .NET API: NER | Source: NER

PII

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
textKey = os.environ.get("COGNITIVE_API_KEY")
if textKey is None:
    textKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("1", "en", "My SSN is 859-98-0987"),
    ("2", "en",
     "Your ABA number - 111000025 - is the first 9 digits in the lower left hand corner of your personal check."),
    ("3", "en", "Is 998.214.865-68 your Brazilian CPF number?")
], ["id", "language", "text"])

# NOTE(review): a fixed language is set here, so the per-row "language" column
# is not consulted — confirm that is intended for this example.
pii = (PII()
       .setSubscriptionKey(textKey)
       .setLocation("eastus")
       .setLanguage("en")
       .setOutputCol("response"))

pii.transform(df).show()
Python API: PII | Scala API: PII | .NET API: PII | Source: PII

TextSentiment

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
textKey = os.environ.get("COGNITIVE_API_KEY")
if textKey is None:
    textKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("I am so happy today, its sunny!", "en-US"),
    ("I am frustrated by this rush hour traffic", "en-US"),
    ("The cognitive services on spark aint bad", "en-US"),
], ["text", "language"])

sentiment = (TextSentiment()
             .setSubscriptionKey(textKey)
             .setLocation("eastus")
             .setTextCol("text")
             .setOutputCol("sentiment")
             .setErrorCol("error")
             .setLanguageCol("language"))

sentiment.transform(df).show()
Python API: TextSentiment | Scala API: TextSentiment | .NET API: TextSentiment | Source: TextSentiment

Translator

Translate

from synapse.ml.cognitive import *
from pyspark.sql.functions import col, flatten
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
translatorKey = os.environ.get("TRANSLATOR_KEY")
if translatorKey is None:
    translatorKey = getSecret("translator-key")

# Each row carries an array of strings; the service translates the whole batch.
df = spark.createDataFrame([
    (["Hello, what is your name?", "Bye"],)
], ["text",])

translate = (Translate()
             .setSubscriptionKey(translatorKey)
             .setLocation("eastus")
             .setTextCol("text")
             .setToLanguage(["zh-Hans", "fr"])
             .setOutputCol("translation")
             .setConcurrency(5))

# Flatten the nested per-language results down to the translated strings.
(translate
 .transform(df)
 .withColumn("translation", flatten(col("translation.translations")))
 .withColumn("translation", col("translation.text"))
 .select("translation")).show()
Python API: Translate | Scala API: Translate | .NET API: Translate | Source: Translate

Transliterate

from synapse.ml.cognitive import *
from pyspark.sql.functions import col
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
translatorKey = os.environ.get("TRANSLATOR_KEY")
if translatorKey is None:
    translatorKey = getSecret("translator-key")

df = spark.createDataFrame([
    (["こんにちは", "さようなら"],)
], ["text",])

# Convert Japanese text from Japanese script to Latin script.
transliterate = (Transliterate()
                 .setSubscriptionKey(translatorKey)
                 .setLocation("eastus")
                 .setLanguage("ja")
                 .setFromScript("Jpan")
                 .setToScript("Latn")
                 .setTextCol("text")
                 .setOutputCol("result"))

(transliterate
 .transform(df)
 .withColumn("text", col("result.text"))
 .withColumn("script", col("result.script"))
 .select("text", "script")).show()
Python API: Transliterate | Scala API: Transliterate | .NET API: Transliterate | Source: Transliterate

Detect

from synapse.ml.cognitive import *
from pyspark.sql.functions import col
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
translatorKey = os.environ.get("TRANSLATOR_KEY")
if translatorKey is None:
    translatorKey = getSecret("translator-key")

df = spark.createDataFrame([
    (["Hello, what is your name?"],)
], ["text",])

detect = (Detect()
          .setSubscriptionKey(translatorKey)
          .setLocation("eastus")
          .setTextCol("text")
          .setOutputCol("result"))

(detect
 .transform(df)
 .withColumn("language", col("result.language"))
 .select("language")).show()
Python API: Detect | Scala API: Detect | .NET API: Detect | Source: Detect

BreakSentence

from synapse.ml.cognitive import *
from pyspark.sql.functions import col, flatten
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
translatorKey = os.environ.get("TRANSLATOR_KEY")
if translatorKey is None:
    translatorKey = getSecret("translator-key")

df = spark.createDataFrame([
    (["Hello, what is your name?"],)
], ["text",])

breakSentence = (BreakSentence()
                 .setSubscriptionKey(translatorKey)
                 .setLocation("eastus")
                 .setTextCol("text")
                 .setOutputCol("result"))

# sentLen holds the character length of each detected sentence.
(breakSentence
 .transform(df)
 .withColumn("sentLen", flatten(col("result.sentLen")))
 .select("sentLen")).show()
Python API: BreakSentence | Scala API: BreakSentence | .NET API: BreakSentence | Source: BreakSentence

DictionaryLookup

from synapse.ml.cognitive import *
from pyspark.sql.functions import col, flatten
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
translatorKey = os.environ.get("TRANSLATOR_KEY")
if translatorKey is None:
    translatorKey = getSecret("translator-key")

df = spark.createDataFrame([
    (["fly"],)
], ["text",])

dictionaryLookup = (DictionaryLookup()
                    .setSubscriptionKey(translatorKey)
                    .setLocation("eastus")
                    .setFromLanguage("en")
                    .setToLanguage("es")
                    .setTextCol("text")
                    .setOutputCol("result"))

(dictionaryLookup
 .transform(df)
 .withColumn("translations", flatten(col("result.translations")))
 .withColumn("normalizedTarget", col("translations.normalizedTarget"))
 .select("normalizedTarget")).show()
Python API: DictionaryLookup | Scala API: DictionaryLookup | .NET API: DictionaryLookup | Source: DictionaryLookup

DictionaryExamples

from synapse.ml.cognitive import *
from pyspark.sql.functions import array, col, flatten, struct
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
translatorKey = os.environ.get("TRANSLATOR_KEY")
if translatorKey is None:
    translatorKey = getSecret("translator-key")

# The service expects an array of (text, translation) structs per row.
df = (spark.createDataFrame([
    ("fly", "volar")
], ["text", "translation"])
    .withColumn("textAndTranslation", array(struct(col("text"), col("translation")))))

dictionaryExamples = (DictionaryExamples()
                      .setSubscriptionKey(translatorKey)
                      .setLocation("eastus")
                      .setFromLanguage("en")
                      .setToLanguage("es")
                      .setTextAndTranslationCol("textAndTranslation")
                      .setOutputCol("result"))

(dictionaryExamples
 .transform(df)
 .withColumn("examples", flatten(col("result.examples")))
 .select("examples")).show()
Python API: DictionaryExamples | Scala API: DictionaryExamples | .NET API: DictionaryExamples | Source: DictionaryExamples

DocumentTranslator

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
translatorKey = os.environ.get("TRANSLATOR_KEY")
if translatorKey is None:
    translatorKey = getSecret("translator-key")
# The plain-string default is cheap, so dict.get is fine here.
translatorName = os.environ.get("TRANSLATOR_NAME", "mmlspark-translator")

# Configured only — source/target columns must be supplied by the caller's DataFrame.
documentTranslator = (DocumentTranslator()
                      .setSubscriptionKey(translatorKey)
                      .setServiceName(translatorName)
                      .setSourceUrlCol("sourceUrl")
                      .setTargetsCol("targets")
                      .setOutputCol("translationStatus"))
Python API: DocumentTranslator | Scala API: DocumentTranslator | .NET API: DocumentTranslator | Source: DocumentTranslator

Computer Vision

OCR

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg", ),
], ["url", ])

ocr = (OCR()
       .setSubscriptionKey(cognitiveKey)
       .setLocation("eastus")
       .setImageUrlCol("url")
       .setDetectOrientation(True)
       .setOutputCol("ocr"))

ocr.transform(df).show()
Python API: OCR | Scala API: OCR | .NET API: OCR | Source: OCR

AnalyzeImage

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

# A None language exercises the service's default-language path.
df = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg", "en"),
    ("https://mmlspark.blob.core.windows.net/datasets/OCR/test2.png", None),
    ("https://mmlspark.blob.core.windows.net/datasets/OCR/test3.png", "en")
], ["image", "language"])

ai = (AnalyzeImage()
      .setSubscriptionKey(cognitiveKey)
      .setLocation("eastus")
      .setImageUrlCol("image")
      .setLanguageCol("language")
      .setVisualFeatures(["Categories", "Tags", "Description", "Faces", "ImageType", "Color", "Adult", "Objects", "Brands"])
      .setDetails(["Celebrities", "Landmarks"])
      .setOutputCol("features"))

ai.transform(df).show()
Python API: AnalyzeImage | Scala API: AnalyzeImage | .NET API: AnalyzeImage | Source: AnalyzeImage

RecognizeText

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg", ),
    ("https://mmlspark.blob.core.windows.net/datasets/OCR/test2.png", ),
    ("https://mmlspark.blob.core.windows.net/datasets/OCR/test3.png", )
], ["url", ])

rt = (RecognizeText()
      .setSubscriptionKey(cognitiveKey)
      .setLocation("eastus")
      .setImageUrlCol("url")
      .setMode("Printed")
      .setOutputCol("ocr")
      .setConcurrency(5))

rt.transform(df).show()
Python API: RecognizeText | Scala API: RecognizeText | .NET API: RecognizeText | Source: RecognizeText

ReadImage

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg", ),
    ("https://mmlspark.blob.core.windows.net/datasets/OCR/test2.png", ),
    ("https://mmlspark.blob.core.windows.net/datasets/OCR/test3.png", )
], ["url", ])

ri = (ReadImage()
      .setSubscriptionKey(cognitiveKey)
      .setLocation("eastus")
      .setImageUrlCol("url")
      .setOutputCol("ocr")
      .setConcurrency(5))

ri.transform(df).show()
Python API: ReadImage | Scala API: ReadImage | .NET API: ReadImage | Source: ReadImage

RecognizeDomainSpecificContent

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg", )
], ["url", ])

# "celebrities" selects the domain-specific recognition model.
celeb = (RecognizeDomainSpecificContent()
         .setSubscriptionKey(cognitiveKey)
         .setModel("celebrities")
         .setLocation("eastus")
         .setImageUrlCol("url")
         .setOutputCol("celebs"))

celeb.transform(df).show()
Python API: RecognizeDomainSpecificContent | Scala API: RecognizeDomainSpecificContent | .NET API: RecognizeDomainSpecificContent | Source: RecognizeDomainSpecificContent

GenerateThumbnails

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg", )
], ["url", ])

gt = (GenerateThumbnails()
      .setSubscriptionKey(cognitiveKey)
      .setLocation("eastus")
      .setHeight(50)
      .setWidth(50)
      .setSmartCropping(True)
      .setImageUrlCol("url")
      .setOutputCol("thumbnails"))

gt.transform(df).show()
Python API: GenerateThumbnails | Scala API: GenerateThumbnails | .NET API: GenerateThumbnails | Source: GenerateThumbnails

TagImage

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg", )
], ["url", ])

ti = (TagImage()
      .setSubscriptionKey(cognitiveKey)
      .setLocation("eastus")
      .setImageUrlCol("url")
      .setOutputCol("tags"))

ti.transform(df).show()
Python API: TagImage | Scala API: TagImage | .NET API: TagImage | Source: TagImage

DescribeImage

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg", )
], ["url", ])

di = (DescribeImage()
      .setSubscriptionKey(cognitiveKey)
      .setLocation("eastus")
      .setMaxCandidates(3)
      .setImageUrlCol("url")
      .setOutputCol("descriptions"))

di.transform(df).show()
Python API: DescribeImage | Scala API: DescribeImage | .NET API: DescribeImage | Source: DescribeImage

Form Recognizer

AnalyzeLayout

from synapse.ml.cognitive import *
from pyspark.sql.functions import col, flatten
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

imageDf = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/layout1.jpg",)
], ["source",])

analyzeLayout = (AnalyzeLayout()
                 .setSubscriptionKey(cognitiveKey)
                 .setLocation("eastus")
                 .setImageUrlCol("source")
                 .setOutputCol("layout")
                 .setConcurrency(5))

# Pull the OCR lines and table-cell text out of the nested layout result.
(analyzeLayout.transform(imageDf)
 .withColumn("lines", flatten(col("layout.analyzeResult.readResults.lines")))
 .withColumn("readLayout", col("lines.text"))
 .withColumn("tables", flatten(col("layout.analyzeResult.pageResults.tables")))
 .withColumn("cells", flatten(col("tables.cells")))
 .withColumn("pageLayout", col("cells.text"))
 .select("source", "readLayout", "pageLayout")).show()
Python API: AnalyzeLayout | Scala API: AnalyzeLayout | .NET API: AnalyzeLayout | Source: AnalyzeLayout

AnalyzeReceipts

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

imageDf = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/receipt1.png",),
    ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/receipt1.png",)
], ["image",])

analyzeReceipts = (AnalyzeReceipts()
                   .setSubscriptionKey(cognitiveKey)
                   .setLocation("eastus")
                   .setImageUrlCol("image")
                   .setOutputCol("receipts")
                   .setConcurrency(5))

analyzeReceipts.transform(imageDf).show()
Python API: AnalyzeReceipts | Scala API: AnalyzeReceipts | .NET API: AnalyzeReceipts | Source: AnalyzeReceipts

AnalyzeBusinessCards

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

imageDf = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/business_card.jpg",)
], ["source",])

analyzeBusinessCards = (AnalyzeBusinessCards()
                        .setSubscriptionKey(cognitiveKey)
                        .setLocation("eastus")
                        .setImageUrlCol("source")
                        .setOutputCol("businessCards")
                        .setConcurrency(5))

analyzeBusinessCards.transform(imageDf).show()
Python API: AnalyzeBusinessCards | Scala API: AnalyzeBusinessCards | .NET API: AnalyzeBusinessCards | Source: AnalyzeBusinessCards

AnalyzeInvoices

from synapse.ml.cognitive import *
from pyspark.sql.functions import col, explode
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

imageDf = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/invoice2.png",)
], ["source",])

analyzeInvoices = (AnalyzeInvoices()
                   .setSubscriptionKey(cognitiveKey)
                   .setLocation("eastus")
                   .setImageUrlCol("source")
                   .setOutputCol("invoices")
                   .setConcurrency(5))

# One output row per recognized document's field map.
(analyzeInvoices
 .transform(imageDf)
 .withColumn("documents", explode(col("invoices.analyzeResult.documentResults.fields")))
 .select("source", "documents")).show()
Python API: AnalyzeInvoices | Scala API: AnalyzeInvoices | .NET API: AnalyzeInvoices | Source: AnalyzeInvoices

AnalyzeIDDocuments

from synapse.ml.cognitive import *
from pyspark.sql.functions import col, explode
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

imageDf = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/id1.jpg",)
], ["source",])

analyzeIDDocuments = (AnalyzeIDDocuments()
                      .setSubscriptionKey(cognitiveKey)
                      .setLocation("eastus")
                      .setImageUrlCol("source")
                      .setOutputCol("ids")
                      .setConcurrency(5))

(analyzeIDDocuments
 .transform(imageDf)
 .withColumn("documents", explode(col("ids.analyzeResult.documentResults.fields")))
 .select("source", "documents")).show()
Python API: AnalyzeIDDocuments | Scala API: AnalyzeIDDocuments | .NET API: AnalyzeIDDocuments | Source: AnalyzeIDDocuments

AnalyzeCustomModel

from synapse.ml.cognitive import *
from pyspark.sql.functions import col, create_map, flatten, lit
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")
modelId = "02bc2f58-2beb-4ae3-84fb-08f011b2f7b8"  # put your own modelId here

imageDf = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/invoice2.png",)
], ["source",])

analyzeCustomModel = (AnalyzeCustomModel()
                      .setSubscriptionKey(cognitiveKey)
                      .setLocation("eastus")
                      .setModelId(modelId)
                      .setImageUrlCol("source")
                      .setOutputCol("output")
                      .setConcurrency(5))

# Reshape the recognized key/value pairs into a simple {key, value} map column.
(analyzeCustomModel
 .transform(imageDf)
 .withColumn("keyValuePairs", flatten(col("output.analyzeResult.pageResults.keyValuePairs")))
 .withColumn("keys", col("keyValuePairs.key.text"))
 .withColumn("values", col("keyValuePairs.value.text"))
 .withColumn("keyValuePairs", create_map(lit("key"), col("keys"), lit("value"), col("values")))
 .select("source", "keyValuePairs")).show()
Python API: AnalyzeCustomModel | Scala API: AnalyzeCustomModel | .NET API: AnalyzeCustomModel | Source: AnalyzeCustomModel

GetCustomModel

from synapse.ml.cognitive import *
from pyspark.sql.functions import col
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")
modelId = "02bc2f58-2beb-4ae3-84fb-08f011b2f7b8"  # put your own modelId here

# Single-row dummy input: the transformer only needs one row to trigger the call.
emptyDf = spark.createDataFrame([("",)])

getCustomModel = (GetCustomModel()
                  .setSubscriptionKey(cognitiveKey)
                  .setLocation("eastus")
                  .setModelId(modelId)
                  .setIncludeKeys(True)
                  .setOutputCol("model")
                  .setConcurrency(5))

(getCustomModel
 .transform(emptyDf)
 .withColumn("modelInfo", col("model.ModelInfo"))
 .withColumn("trainResult", col("model.TrainResult"))
 .select("modelInfo", "trainResult")).show()
Python API: GetCustomModel | Scala API: GetCustomModel | .NET API: GetCustomModel | Source: GetCustomModel

ListCustomModels

from synapse.ml.cognitive import *
from pyspark.sql.functions import col
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

# Single-row dummy input: the transformer only needs one row to trigger the call.
emptyDf = spark.createDataFrame([("",)])

listCustomModels = (ListCustomModels()
                    .setSubscriptionKey(cognitiveKey)
                    .setLocation("eastus")
                    .setOp("full")
                    .setOutputCol("models")
                    .setConcurrency(5))

(listCustomModels
 .transform(emptyDf)
 .withColumn("modelIds", col("models.modelList.modelId"))
 .select("modelIds")).show()
Python API: ListCustomModels | Scala API: ListCustomModels | .NET API: ListCustomModels | Source: ListCustomModels

Form Recognizer V3

AnalyzeDocument

from synapse.ml.cognitive import *
from pyspark.sql.functions import col, flatten
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

imageDf = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/layout1.jpg",)
], ["source",])

analyzeDocument = (AnalyzeDocument()
                   # For supported prebuilt models, please go to documentation page for details
                   .setPrebuiltModelId("prebuilt-layout")
                   .setSubscriptionKey(cognitiveKey)
                   .setLocation("eastus")
                   .setImageUrlCol("source")
                   .setOutputCol("result")
                   .setConcurrency(5))

(analyzeDocument.transform(imageDf)
 .withColumn("content", col("result.analyzeResult.content"))
 .withColumn("cells", flatten(col("result.analyzeResult.tables.cells")))
 .withColumn("cells", col("cells.content"))
 .select("source", "result", "content", "cells")).show()
Python API: AnalyzeDocument | Scala API: AnalyzeDocument | .NET API: AnalyzeDocument | Source: AnalyzeDocument

Anomaly Detection

DetectLastAnomaly

from synapse.ml.cognitive import *
from pyspark.sql.functions import col, collect_list, lit, sort_array, struct
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
anomalyKey = os.environ.get("ANOMALY_API_KEY")
if anomalyKey is None:
    anomalyKey = getSecret("anomaly-api-key")

# Monthly series with two obvious spikes; the service inspects the last point.
# The whole series is packed into one row as a sorted array of (timestamp, value)
# structs, which is the shape DetectLastAnomaly expects in its series column.
df = (spark.createDataFrame([
    ("1972-01-01T00:00:00Z", 826.0),
    ("1972-02-01T00:00:00Z", 799.0),
    ("1972-03-01T00:00:00Z", 890.0),
    ("1972-04-01T00:00:00Z", 900.0),
    ("1972-05-01T00:00:00Z", 766.0),
    ("1972-06-01T00:00:00Z", 805.0),
    ("1972-07-01T00:00:00Z", 821.0),
    ("1972-08-01T00:00:00Z", 20000.0),
    ("1972-09-01T00:00:00Z", 883.0),
    ("1972-10-01T00:00:00Z", 898.0),
    ("1972-11-01T00:00:00Z", 957.0),
    ("1972-12-01T00:00:00Z", 924.0),
    ("1973-01-01T00:00:00Z", 881.0),
    ("1973-02-01T00:00:00Z", 837.0),
    ("1973-03-01T00:00:00Z", 90000.0)
], ["timestamp", "value"])
    .withColumn("group", lit(1))
    .withColumn("inputs", struct(col("timestamp"), col("value")))
    .groupBy(col("group"))
    .agg(sort_array(collect_list(col("inputs"))).alias("inputs")))

dla = (DetectLastAnomaly()
       .setSubscriptionKey(anomalyKey)
       .setLocation("westus2")
       .setOutputCol("anomalies")
       .setSeriesCol("inputs")
       .setGranularity("monthly")
       .setErrorCol("errors"))

dla.transform(df).show()
Python API: DetectLastAnomaly | Scala API: DetectLastAnomaly | .NET API: DetectLastAnomaly | Source: DetectLastAnomaly

DetectAnomalies

from synapse.ml.cognitive import *
from pyspark.sql.functions import col, collect_list, lit, sort_array, struct
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
anomalyKey = os.environ.get("ANOMALY_API_KEY")
if anomalyKey is None:
    anomalyKey = getSecret("anomaly-api-key")

# Same packing as DetectLastAnomaly: one row holding the sorted series of
# (timestamp, value) structs; the service scores every point.
df = (spark.createDataFrame([
    ("1972-01-01T00:00:00Z", 826.0),
    ("1972-02-01T00:00:00Z", 799.0),
    ("1972-03-01T00:00:00Z", 890.0),
    ("1972-04-01T00:00:00Z", 900.0),
    ("1972-05-01T00:00:00Z", 766.0),
    ("1972-06-01T00:00:00Z", 805.0),
    ("1972-07-01T00:00:00Z", 821.0),
    ("1972-08-01T00:00:00Z", 20000.0),
    ("1972-09-01T00:00:00Z", 883.0),
    ("1972-10-01T00:00:00Z", 898.0),
    ("1972-11-01T00:00:00Z", 957.0),
    ("1972-12-01T00:00:00Z", 924.0),
    ("1973-01-01T00:00:00Z", 881.0),
    ("1973-02-01T00:00:00Z", 837.0),
    ("1973-03-01T00:00:00Z", 90000.0)
], ["timestamp", "value"])
    .withColumn("group", lit(1))
    .withColumn("inputs", struct(col("timestamp"), col("value")))
    .groupBy(col("group"))
    .agg(sort_array(collect_list(col("inputs"))).alias("inputs")))

da = (DetectAnomalies()
      .setSubscriptionKey(anomalyKey)
      .setLocation("westus2")
      .setOutputCol("anomalies")
      .setSeriesCol("inputs")
      .setGranularity("monthly"))

da.transform(df).show()
Python API: DetectAnomalies | Scala API: DetectAnomalies | .NET API: DetectAnomalies | Source: DetectAnomalies

SimpleDetectAnomalies

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
anomalyKey = os.environ.get("ANOMALY_API_KEY")
if anomalyKey is None:
    anomalyKey = getSecret("anomaly-api-key")

# Two identical monthly series distinguished by the "group" column;
# SimpleDetectAnomalies does its own grouping via setGroupbyCol.
df = (spark.createDataFrame([
    ("1972-01-01T00:00:00Z", 826.0, 1.0),
    ("1972-02-01T00:00:00Z", 799.0, 1.0),
    ("1972-03-01T00:00:00Z", 890.0, 1.0),
    ("1972-04-01T00:00:00Z", 900.0, 1.0),
    ("1972-05-01T00:00:00Z", 766.0, 1.0),
    ("1972-06-01T00:00:00Z", 805.0, 1.0),
    ("1972-07-01T00:00:00Z", 821.0, 1.0),
    ("1972-08-01T00:00:00Z", 20000.0, 1.0),
    ("1972-09-01T00:00:00Z", 883.0, 1.0),
    ("1972-10-01T00:00:00Z", 898.0, 1.0),
    ("1972-11-01T00:00:00Z", 957.0, 1.0),
    ("1972-12-01T00:00:00Z", 924.0, 1.0),
    ("1973-01-01T00:00:00Z", 881.0, 1.0),
    ("1973-02-01T00:00:00Z", 837.0, 1.0),
    ("1973-03-01T00:00:00Z", 90000.0, 1.0),
    ("1972-01-01T00:00:00Z", 826.0, 2.0),
    ("1972-02-01T00:00:00Z", 799.0, 2.0),
    ("1972-03-01T00:00:00Z", 890.0, 2.0),
    ("1972-04-01T00:00:00Z", 900.0, 2.0),
    ("1972-05-01T00:00:00Z", 766.0, 2.0),
    ("1972-06-01T00:00:00Z", 805.0, 2.0),
    ("1972-07-01T00:00:00Z", 821.0, 2.0),
    ("1972-08-01T00:00:00Z", 20000.0, 2.0),
    ("1972-09-01T00:00:00Z", 883.0, 2.0),
    ("1972-10-01T00:00:00Z", 898.0, 2.0),
    ("1972-11-01T00:00:00Z", 957.0, 2.0),
    ("1972-12-01T00:00:00Z", 924.0, 2.0),
    ("1973-01-01T00:00:00Z", 881.0, 2.0),
    ("1973-02-01T00:00:00Z", 837.0, 2.0),
    ("1973-03-01T00:00:00Z", 90000.0, 2.0)
], ["timestamp", "value", "group"]))

sda = (SimpleDetectAnomalies()
       .setSubscriptionKey(anomalyKey)
       .setLocation("westus2")
       .setOutputCol("anomalies")
       .setGroupbyCol("group")
       .setGranularity("monthly"))

sda.transform(df).show()
Python API: SimpleDetectAnomalies | Scala API: SimpleDetectAnomalies | .NET API: SimpleDetectAnomalies | Source: SimpleDetectAnomalies

Face

DetectFace

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg",),
], ["url"])

face = (DetectFace()
        .setSubscriptionKey(cognitiveKey)
        .setLocation("eastus")
        .setImageUrlCol("url")
        .setOutputCol("detected_faces")
        .setReturnFaceId(True)
        .setReturnFaceLandmarks(False)
        .setReturnFaceAttributes(["age", "gender", "headPose", "smile", "facialHair", "glasses", "emotion",
                                  "hair", "makeup", "occlusion", "accessories", "blur", "exposure", "noise"]))

face.transform(df).show()
Python API: DetectFace | Scala API: DetectFace | .NET API: DetectFace | Source: DetectFace

FindSimilarFace

from synapse.ml.cognitive import *
from pyspark.sql.functions import col
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg",),
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg",),
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test3.jpg",)
], ["url"])

detector = (DetectFace()
            .setSubscriptionKey(cognitiveKey)
            .setLocation("eastus")
            .setImageUrlCol("url")
            .setOutputCol("detected_faces")
            .setReturnFaceId(True)
            .setReturnFaceLandmarks(False)
            .setReturnFaceAttributes([]))

# Keep only the first detected face's id per image, then collect the ids
# locally so they can be passed to setFaceIds.
faceIdDF = detector.transform(df).select("detected_faces").select(col("detected_faces").getItem(0).getItem("faceId").alias("id"))
faceIds = [row.asDict()['id'] for row in faceIdDF.collect()]

findSimilar = (FindSimilarFace()
               .setSubscriptionKey(cognitiveKey)
               .setLocation("eastus")
               .setOutputCol("similar")
               .setFaceIdCol("id")
               .setFaceIds(faceIds))

findSimilar.transform(faceIdDF).show()
Python API: FindSimilarFace | Scala API: FindSimilarFace | .NET API: FindSimilarFace | Source: FindSimilarFace

GroupFaces

from synapse.ml.cognitive import *
from pyspark.sql.functions import col
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg",),
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg",),
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test3.jpg",)
], ["url"])

detector = (DetectFace()
            .setSubscriptionKey(cognitiveKey)
            .setLocation("eastus")
            .setImageUrlCol("url")
            .setOutputCol("detected_faces")
            .setReturnFaceId(True)
            .setReturnFaceLandmarks(False)
            .setReturnFaceAttributes([]))

# First detected face id per image, collected locally for setFaceIds.
faceIdDF = detector.transform(df).select("detected_faces").select(col("detected_faces").getItem(0).getItem("faceId").alias("id"))
faceIds = [row.asDict()['id'] for row in faceIdDF.collect()]

group = (GroupFaces()
         .setSubscriptionKey(cognitiveKey)
         .setLocation("eastus")
         .setOutputCol("grouping")
         .setFaceIds(faceIds))

group.transform(faceIdDF).show()
Python API: GroupFaces | Scala API: GroupFaces | .NET API: GroupFaces | Source: GroupFaces

IdentifyFaces

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")
pgId = "PUT_YOUR_PERSON_GROUP_ID"

# Configured only — requires an existing person group and a "faces" column
# of face ids in the caller's DataFrame.
identifyFaces = (IdentifyFaces()
                 .setSubscriptionKey(cognitiveKey)
                 .setLocation("eastus")
                 .setFaceIdsCol("faces")
                 .setPersonGroupId(pgId)
                 .setOutputCol("identified_faces"))
Python API: IdentifyFaces | Scala API: IdentifyFaces | .NET API: IdentifyFaces | Source: IdentifyFaces

VerifyFaces

from synapse.ml.cognitive import *
from pyspark.sql.functions import col, lit
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

df = spark.createDataFrame([
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg",),
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg",),
    ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test3.jpg",)
], ["url"])

detector = (DetectFace()
            .setSubscriptionKey(cognitiveKey)
            .setLocation("eastus")
            .setImageUrlCol("url")
            .setOutputCol("detected_faces")
            .setReturnFaceId(True)
            .setReturnFaceLandmarks(False)
            .setReturnFaceAttributes([]))

# Pair every face id with the first image's face id so VerifyFaces can
# compare each row's faceId1 against the constant faceId2.
faceIdDF = detector.transform(df).select("detected_faces").select(col("detected_faces").getItem(0).getItem("faceId").alias("faceId1"))
faceIdDF2 = faceIdDF.withColumn("faceId2", lit(faceIdDF.take(1)[0].asDict()['faceId1']))

verify = (VerifyFaces()
          .setSubscriptionKey(cognitiveKey)
          .setLocation("eastus")
          .setOutputCol("same")
          .setFaceId1Col("faceId1")
          .setFaceId2Col("faceId2"))

verify.transform(faceIdDF2).show()
Python API: VerifyFaces | Scala API: VerifyFaces | .NET API: VerifyFaces | Source: VerifyFaces

Speech To Text

SpeechToText

from synapse.ml.cognitive import *
import os
import requests

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

# Download the sample WAV once on the driver and ship the raw bytes in a column.
link = "https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav"
audioBytes = requests.get(link).content
df = spark.createDataFrame([(audioBytes,)
                            ], ["audio"])

stt = (SpeechToText()
       .setSubscriptionKey(cognitiveKey)
       .setLocation("eastus")
       .setOutputCol("text")
       .setAudioDataCol("audio")
       .setLanguage("en-US")
       .setFormat("simple"))

stt.transform(df).show()
Python API: SpeechToText | Scala API: SpeechToText | .NET API: SpeechToText | Source: SpeechToText

SpeechToTextSDK

from synapse.ml.cognitive import *
import os
import requests

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
cognitiveKey = os.environ.get("COGNITIVE_API_KEY")
if cognitiveKey is None:
    cognitiveKey = getSecret("cognitive-api-key")

# The SDK variant streams directly from the URL — no local download needed.
df = spark.createDataFrame([("https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav",)
                            ], ["url"])

speech_to_text = (SpeechToTextSDK()
                  .setSubscriptionKey(cognitiveKey)
                  .setLocation("eastus")
                  .setOutputCol("text")
                  .setAudioDataCol("url")
                  .setLanguage("en-US")
                  .setProfanity("Masked"))

speech_to_text.transform(df).show()
Python API: SpeechToTextSDK | Scala API: SpeechToTextSDK | .NET API: SpeechToTextSDK | Source: SpeechToTextSDK

AzureSearch

from synapse.ml.cognitive import *
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
azureSearchKey = os.environ.get("AZURE_SEARCH_KEY")
if azureSearchKey is None:
    azureSearchKey = getSecret("azure-search-key")
testServiceName = "mmlspark-azure-search"

indexName = "test-website"

def createSimpleIndexJson(indexName):
    """Return the JSON index definition (as a string) for a minimal Azure
    Search index named *indexName* with id, fileName and text string fields.

    Note: the scraped original had its body indentation stripped, which made
    it invalid Python; this restores the function structure unchanged.
    """
    json_str = """
    {
        "name": "%s",
        "fields": [
            {
                "name": "id",
                "type": "Edm.String",
                "key": true,
                "facetable": false
            },
            {
                "name": "fileName",
                "type": "Edm.String",
                "searchable": false,
                "sortable": false,
                "facetable": false
            },
            {
                "name": "text",
                "type": "Edm.String",
                "filterable": false,
                "sortable": false,
                "facetable": false
            }
        ]
    }
    """
    # %-interpolation fills in the index name.
    return json_str % indexName

# Four synthetic documents to upload into the search index.
rows = [("upload", str(i), "file%d" % i, "text%d" % i) for i in range(4)]
df = spark.createDataFrame(rows, ["searchAction", "id", "fileName", "text"])

ad = (AddDocuments()
      .setSubscriptionKey(azureSearchKey)
      .setServiceName(testServiceName)
      .setOutputCol("out")
      .setErrorCol("err")
      .setIndexName(indexName)
      .setActionCol("searchAction"))

ad.transform(df).show()

# Alternatively, write the DataFrame straight to Azure Search in one call.
AzureSearchWriter.writeToAzureSearch(df,
                                     subscriptionKey=azureSearchKey,
                                     actionCol="searchAction",
                                     serviceName=testServiceName,
                                     indexJson=createSimpleIndexJson(indexName))
Python API: AzureSearch | Scala API: AzureSearch | .NET API: AzureSearch | Source: AzureSearch

BingImageSearch

from synapse.ml.cognitive import *
from pyspark.ml import PipelineModel
import os

# Lazy secret fallback (dict.get's default would be evaluated eagerly).
bingSearchKey = os.environ.get("BING_SEARCH_KEY")
if bingSearchKey is None:
    bingSearchKey = getSecret("bing-search-key")

# Number of images Bing will return per query
imgsPerBatch = 10
# A list of offsets, used to page into the search results
offsets = [(i * imgsPerBatch,) for i in range(100)]
# Since web content is our data, we create a dataframe with options on that data: offsets
bingParameters = spark.createDataFrame(offsets, ["offset"])

# Run the Bing Image Search service with our text query
bingSearch = (BingImageSearch()
              .setSubscriptionKey(bingSearchKey)
              .setOffsetCol("offset")
              .setQuery("Martin Luther King Jr. quotes")
              .setCount(imgsPerBatch)
              .setOutputCol("images"))

# Transformer that extracts and flattens the richly structured output of Bing Image Search into a simple URL column
getUrls = BingImageSearch.getUrlTransformer("images", "url")

# This displays the full results returned
bingSearch.transform(bingParameters).show()

# Since we have two services, they are put into a pipeline
pipeline = PipelineModel(stages=[bingSearch, getUrls])

# Show the results of your search: image URLs
pipeline.transform(bingParameters).show()

Python API: BingImageSearch | Scala API: BingImageSearch | .NET API: BingImageSearch | Source: BingImageSearch