Version: 1.0.4

Explainers

ImageLIME
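Explains an image model's predictions with LIME: the image is segmented into superpixels, randomly masked copies are scored by the model, and a local linear surrogate assigns a weight to each superpixel.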

from synapse.ml.explainers import *
from synapse.ml.onnx import ONNXModel

model = ONNXModel()

lime = (ImageLIME()
    .setModel(model)
    .setOutputCol("weights")
    .setInputCol("image")
    .setCellSize(150.0)
    .setModifier(50.0)
    .setNumSamples(500)
    .setTargetCol("probability")
    .setTargetClassesCol("top2pred")
    .setSamplingFraction(0.7))
Python API: ImageLIME | Scala API: ImageLIME | Source: ImageLIME

ImageSHAP
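Computes Kernel SHAP values for an image model, estimating each superpixel's contribution to the target class.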

from synapse.ml.explainers import *
from synapse.ml.onnx import ONNXModel

model = ONNXModel()

shap = (ImageSHAP()
    .setModel(model)
    .setOutputCol("shaps")
    .setSuperpixelCol("superpixels")
    .setInputCol("image")
    .setCellSize(150.0)
    .setModifier(50.0)
    .setNumSamples(500)
    .setTargetCol("probability")
    .setTargetClassesCol("top2pred"))
Python API: ImageSHAP | Scala API: ImageSHAP | Source: ImageSHAP

TabularLIME
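Explains a tabular model with LIME: perturbed rows are sampled using the background dataset, and a kernel-weighted linear model produces per-feature weights.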

from synapse.ml.explainers import *
from synapse.ml.onnx import ONNXModel

model = ONNXModel()
data = spark.createDataFrame([
    (-6.0, 0),
    (-5.0, 0),
    (5.0, 1),
    (6.0, 1)
], ["col1", "label"])

lime = (TabularLIME()
    .setModel(model)
    .setInputCols(["col1"])
    .setOutputCol("weights")
    .setBackgroundData(data)
    .setKernelWidth(0.001)
    .setNumSamples(1000)
    .setTargetCol("probability")
    .setTargetClasses([0, 1]))
Python API: TabularLIME | Scala API: TabularLIME | Source: TabularLIME

TabularSHAP
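Computes Kernel SHAP values for a tabular model; the background dataset supplies the reference values used when a feature is treated as absent.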

from synapse.ml.explainers import *
from synapse.ml.onnx import ONNXModel

model = ONNXModel()
data = spark.createDataFrame([
    (-5.0, "a", -5.0, 0),
    (-5.0, "b", -5.0, 0),
    (5.0, "a", 5.0, 1),
    (5.0, "b", 5.0, 1)
] * 100, ["col1", "col2", "col3", "label"])

shap = (TabularSHAP()
    .setInputCols(["col1", "col2", "col3"])
    .setOutputCol("shapValues")
    .setBackgroundData(data)
    .setNumSamples(1000)
    .setModel(model)
    .setTargetCol("probability")
    .setTargetClasses([1]))
Python API: TabularSHAP | Scala API: TabularSHAP | Source: TabularSHAP

TextLIME
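Explains a text model with LIME: tokens are randomly dropped from the input and the resulting score changes yield a weight per token.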

from synapse.ml.explainers import *
from synapse.ml.onnx import ONNXModel

model = ONNXModel()

lime = (TextLIME()
    .setModel(model)
    .setInputCol("text")
    .setTargetCol("prob")
    .setTargetClasses([1])
    .setOutputCol("weights")
    .setTokensCol("tokens")
    .setSamplingFraction(0.7)
    .setNumSamples(1000))
Python API: TextLIME | Scala API: TextLIME | Source: TextLIME

TextSHAP
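Computes Kernel SHAP values for a text model, estimating each token's contribution to the target class.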

from synapse.ml.explainers import *
from synapse.ml.onnx import ONNXModel

model = ONNXModel()

shap = (TextSHAP()
    .setModel(model)
    .setInputCol("text")
    .setTargetCol("prob")
    .setTargetClasses([1])
    .setOutputCol("weights")
    .setTokensCol("tokens")
    .setNumSamples(1000))
Python API: TextSHAP | Scala API: TextSHAP | Source: TextSHAP

VectorLIME
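Explains a model that consumes numeric feature vectors: perturbed vectors are sampled around the instance and a local linear surrogate yields per-dimension weights.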

from synapse.ml.explainers import *
from synapse.ml.onnx import ONNXModel

model = ONNXModel()

df = spark.createDataFrame([
    ([0.2729799734928408, -0.4637273304253777, 1.565593782147994], 4.541185129673482),
    ([1.9511879801376864, 1.495644437589599, -0.4667847796501322], 0.19526424470709836)
], ["features", "label"])

lime = (VectorLIME()
    .setModel(model)
    .setBackgroundData(df)
    .setInputCol("features")
    .setTargetCol("label")
    .setOutputCol("weights")
    .setNumSamples(1000))
Python API: VectorLIME | Scala API: VectorLIME | Source: VectorLIME

VectorSHAP
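Computes Kernel SHAP values for a model that consumes numeric feature vectors.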

from synapse.ml.explainers import *
from synapse.ml.onnx import ONNXModel

model = ONNXModel()

shap = (VectorSHAP()
    .setInputCol("features")
    .setOutputCol("shapValues")
    .setNumSamples(1000)
    .setModel(model)
    .setTargetCol("probability")
    .setTargetClasses([1]))
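
The snippet above configures the explainer but omits input data. A minimal sketch of a compatible DataFrame, assuming the features are Spark ML vectors and that a fitted model adds the "probability" column:

from pyspark.ml.linalg import Vectors

# Hypothetical two-row input; each row carries one feature vector.
df = spark.createDataFrame([
    (Vectors.dense([1.0, 2.0, 3.0]),),
    (Vectors.dense([4.0, 5.0, 6.0]),)
], ["features"])

# shap.transform(df).show()  # requires a real fitted model in place of the empty ONNXModel()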
Python API: VectorSHAP | Scala API: VectorSHAP | Source: VectorSHAP

Featurize

DataConversion
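Casts the selected columns of a DataFrame to a target data type.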

from synapse.ml.featurize import *

df = spark.createDataFrame([
    (True, 1, 2, 3, 4, 5.0, 6.0, "7", "8.0"),
    (False, 9, 10, 11, 12, 14.5, 15.5, "16", "17.456"),
    (True, -127, 345, 666, 1234, 18.91, 20.21, "100", "200.12345")
], ["bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring"])

dc = (DataConversion()
    .setCols(["byte"])
    .setConvertTo("boolean"))

dc.transform(df).show()
Python API: DataConversion | Scala API: DataConversion | Source: DataConversion

IndexToValue
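Reverses a ValueIndexer, mapping category indices back to their original values.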

from synapse.ml.featurize import *

df = spark.createDataFrame([
    (-3, 24, 0.32534, True, "piano"),
    (1, 5, 5.67, False, "piano"),
    (-3, 5, 0.32534, False, "guitar")
], ["int", "long", "double", "bool", "string"])

df2 = (ValueIndexer()
    .setInputCol("string")
    .setOutputCol("string_cat")
    .fit(df)
    .transform(df))

itv = (IndexToValue()
    .setInputCol("string_cat")
    .setOutputCol("string_noncat"))

itv.transform(df2).show()
Python API: IndexToValue | Scala API: IndexToValue | Source: IndexToValue

Featurize Text

MultiNGram
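Computes n-grams of several lengths in one pass and returns them together in a single output column.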

from synapse.ml.featurize.text import *
from pyspark.ml.feature import Tokenizer

dfRaw = spark.createDataFrame([
    (0, "Hi I"),
    (1, "I wish for snow today"),
    (2, "we Cant go to the park, because of the snow!"),
    (3, ""),
    (4, "1 2 3 4 5 6 7 8 9")
], ["label", "sentence"])

dfTok = (Tokenizer()
    .setInputCol("sentence")
    .setOutputCol("tokens")
    .transform(dfRaw))

mng = (MultiNGram()
    .setLengths([1, 3, 4])
    .setInputCol("tokens")
    .setOutputCol("ngrams"))

mng.transform(dfTok).show()
Python API: MultiNGram | Scala API: MultiNGram | Source: MultiNGram

PageSplitter
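Splits a text column into pages: substrings whose character counts fall between the configured minimum and maximum lengths.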

from synapse.ml.featurize.text import *

df = spark.createDataFrame([
    ("words words words wornssaa ehewjkdiw weijnsikjn xnh", ),
    ("s s s s s s", ),
    ("hsjbhjhnskjhndwjnbvckjbnwkjwenbvfkjhbnwevkjhbnwejhkbnvjkhnbndjkbnd", ),
    ("hsjbhjhnskjhndwjnbvckjbnwkjwenbvfkjhbnwevkjhbnwejhkbnvjkhnbndjkbnd 190872340870271091309831097813097130i3u709781", ),
    ("", ),
    (None, )
], ["text"])

ps = (PageSplitter()
    .setInputCol("text")
    .setMaximumPageLength(20)
    .setMinimumPageLength(10)
    .setOutputCol("pages"))

ps.transform(df).show()
Python API: PageSplitter | Scala API: PageSplitter | Source: PageSplitter

Image

UnrollImage
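Flattens the Spark image struct into a flat numeric vector suitable for downstream ML stages; the data-loading steps are commented out because they read from remote storage.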

from synapse.ml.image import *
from azure.storage.blob import *

# images = (spark.read.format("image")
#     .option("dropInvalid", True)
#     .load("wasbs://datasets@mmlspark.blob.core.windows.net/LIME/greyscale.jpg"))

# rit = (ResizeImageTransformer()
#     .setOutputCol("out")
#     .setHeight(15)
#     .setWidth(10))

# preprocessed = rit.transform(images)

unroll = (UnrollImage()
    .setInputCol("out")
    .setOutputCol("final"))

# unroll.transform(preprocessed).show()
Python API: UnrollImage | Scala API: UnrollImage | Source: UnrollImage

UnrollBinaryImage
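Like UnrollImage, but decodes and flattens images stored as raw binary bytes.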

from synapse.ml.image import *

unroll = (UnrollBinaryImage()
    .setInputCol("input_col")
    .setOutputCol("final"))
Python API: UnrollBinaryImage | Scala API: UnrollBinaryImage | Source: UnrollBinaryImage

SuperpixelTransformer
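Clusters an image's pixels into superpixels, the same segmentation used by the image explainers above.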

from synapse.ml.image import *

spt = (SuperpixelTransformer()
    .setInputCol("images"))
Python API: SuperpixelTransformer | Scala API: SuperpixelTransformer | Source: SuperpixelTransformer

IO

HTTPTransformer
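Sends a column of HTTP requests asynchronously, with configurable concurrency, and stores the responses in the output column.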

from synapse.ml.io.http import *
from pyspark.sql.functions import udf, col
from requests import Request

def world_bank_request(country):
    return Request("GET", "http://api.worldbank.org/v2/country/{}?format=json".format(country))

df = (spark.createDataFrame([("br",), ("usa",)], ["country"])
    .withColumn("request", http_udf(world_bank_request)(col("country"))))

ht = (HTTPTransformer()
    .setConcurrency(3)
    .setInputCol("request")
    .setOutputCol("response"))

ht.transform(df).show()
Python API: HTTPTransformer | Scala API: HTTPTransformer | Source: HTTPTransformer

SimpleHTTPTransformer
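Bundles input serialization, the HTTP call, and response parsing into a single stage.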

from synapse.ml.io.http import *
from pyspark.sql.types import StringType, StructType

sht = (SimpleHTTPTransformer()
    .setInputCol("data")
    .setOutputParser(JSONOutputParser()
        .setDataType(StructType().add("blah", StringType())))
    .setUrl("PUT_YOUR_URL")
    .setOutputCol("results")
    .setConcurrency(3))
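
A hypothetical invocation, assuming a live endpoint at the configured URL that returns a JSON object containing a "blah" field:

# df = spark.createDataFrame([("payload",)], ["data"])
# sht.transform(df).show()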
Python API: SimpleHTTPTransformer | Scala API: SimpleHTTPTransformer | Source: SimpleHTTPTransformer

JSONInputParser
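Converts rows of the input column into JSON HTTP requests aimed at the configured URL.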

from synapse.ml.io.http import *

jsonIP = (JSONInputParser()
    .setInputCol("data")
    .setOutputCol("out")
    .setUrl("PUT_YOUR_URL"))
Python API: JSONInputParser | Scala API: JSONInputParser | Source: JSONInputParser

JSONOutputParser
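Parses HTTP response bodies as JSON according to the supplied schema.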

from synapse.ml.io.http import *
from pyspark.sql.types import StringType, StructType

jsonOP = (JSONOutputParser()
    .setDataType(StructType().add("foo", StringType()))
    .setInputCol("unparsedOutput")
    .setOutputCol("parsedOutput"))
Python API: JSONOutputParser | Scala API: JSONOutputParser | Source: JSONOutputParser

StringOutputParser
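Extracts the HTTP response body as a plain string.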

from synapse.ml.io.http import *

sop = (StringOutputParser()
    .setInputCol("unparsedOutput")
    .setOutputCol("out"))
Python API: StringOutputParser | Scala API: StringOutputParser | Source: StringOutputParser

CustomInputParser
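Builds HTTP requests from the input column with a user-supplied function.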

from synapse.ml.io.http import *

cip = (CustomInputParser()
    .setInputCol("data")
    .setOutputCol("out"))
Python API: CustomInputParser | Scala API: CustomInputParser | Source: CustomInputParser

CustomOutputParser
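Parses HTTP responses with a user-supplied function.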

from synapse.ml.io.http import *

cop = (CustomOutputParser()
    .setInputCol("unparsedOutput")
    .setOutputCol("out"))
Python API: CustomOutputParser | Scala API: CustomOutputParser | Source: CustomOutputParser

Stages

Cacher
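Caches the input DataFrame so later pipeline stages reuse it without recomputation.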

from synapse.ml.stages import *

df = spark.createDataFrame([
    (0, "guitars", "drums"),
    (1, "piano", "trumpet"),
    (2, "bass", "cymbals"),
    (3, "guitars", "drums"),
    (4, "piano", "trumpet"),
    (5, "bass", "cymbals"),
    (6, "guitars", "drums"),
    (7, "piano", "trumpet"),
    (8, "bass", "cymbals"),
    (9, "guitars", "drums"),
    (10, "piano", "trumpet"),
    (11, "bass", "cymbals")
], ["numbers", "words", "more"])

cacher = Cacher()

cacher.transform(df).show()
Python API: Cacher | Scala API: Cacher | Source: Cacher

DropColumns
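Removes the listed columns from the DataFrame.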

from synapse.ml.stages import *

df = spark.createDataFrame([
    (0, 0, "guitars", "drums", 1, True),
    (1, 1, "piano", "trumpet", 2, False),
    (2, 2, "bass", "cymbals", 3, True)
], ["numbers", "doubles", "words", "more", "longs", "booleans"])

dc = DropColumns().setCols(["words"])  # drop the "words" column

dc.transform(df).show()
Python API: DropColumns | Scala API: DropColumns | Source: DropColumns

EnsembleByKey
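Groups rows by the key columns and averages the selected score columns, producing one ensembled row per key.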

from synapse.ml.stages import *
from pyspark.ml.feature import VectorAssembler

scoreDF = spark.createDataFrame([
    (0, "foo", 1.0, 0.1),
    (1, "bar", 4.0, -2.0),
    (1, "bar", 0.0, -3.0)
], ["label1", "label2", "score1", "score2"])

va = VectorAssembler().setInputCols(["score1", "score2"]).setOutputCol("v1")
scoreDF2 = va.transform(scoreDF)

ebk = EnsembleByKey().setKeys(["label1"]).setCols(["score1"])

ebk.transform(scoreDF2).show()
Python API: EnsembleByKey | Scala API: EnsembleByKey | Source: EnsembleByKey

Explode
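Expands an array column so that each element becomes its own row.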

from synapse.ml.stages import *

df = spark.createDataFrame([
    (0, ["guitars", "drums"]),
    (1, ["piano"]),
    (2, [])
], ["numbers", "words"])

explode = Explode().setInputCol("words").setOutputCol("exploded")

explode.transform(df).show()
Python API: Explode | Scala API: Explode | Source: Explode

Lambda
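Runs arbitrary user code as a pipeline stage, given one function for the transformation and one for the output schema.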

from synapse.ml.stages import *
from pyspark.sql.types import StringType, StructType

df = spark.createDataFrame([
    (0, 0.0, "guitars", "drums", 1, True),
    (1, 1.0, "piano", "trumpet", 2, False),
    (2, 2.0, "bass", "cymbals", 3, True)
], ["numbers", "doubles", "words", "more", "longs", "booleans"])

def transformFunc(df):
    return df.select("numbers")

def transformSchemaFunc(schema):
    return StructType([schema["numbers"]])

l = (Lambda()
    .setTransformFunc(transformFunc)
    .setTransformSchemaFunc(transformSchemaFunc))

Python API: Lambda | Scala API: Lambda | Source: Lambda

DynamicMiniBatchTransformer
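Groups rows into variable-size mini-batches, packing each batch with the rows available at the time.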

from synapse.ml.stages import *
from pyspark.sql.types import StringType, StructType

df = spark.createDataFrame([(i, "foo") for i in range(1, 11)], ["in1", "in2"])

dmbt = DynamicMiniBatchTransformer()

dmbt.transform(df).show()
Python API: DynamicMiniBatchTransformer | Scala API: DynamicMiniBatchTransformer | Source: DynamicMiniBatchTransformer

FixedMiniBatchTransformer
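Groups rows into mini-batches of a fixed size; with buffering enabled, batches are assembled on a background thread.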

from synapse.ml.stages import *

fmbt = (FixedMiniBatchTransformer()
    .setBuffered(True)
    .setBatchSize(3))
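
A minimal sketch of the batcher in action, reusing the ten-row DataFrame pattern from the other mini-batch examples:

df = spark.createDataFrame([(i, "foo") for i in range(1, 11)], ["in1", "in2"])

fmbt.transform(df).show()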
Python API: FixedMiniBatchTransformer | Scala API: FixedMiniBatchTransformer | Source: FixedMiniBatchTransformer

TimeIntervalMiniBatchTransformer
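Groups into one mini-batch the rows that arrive within the configured wait interval, up to the maximum batch size.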

from synapse.ml.stages import *

df = spark.createDataFrame([(i, "foo") for i in range(1, 11)], ["in1", "in2"])

timbt = (TimeIntervalMiniBatchTransformer()
    .setMillisToWait(1000)
    .setMaxBatchSize(30))

timbt.transform(df).show()
Python API: TimeIntervalMiniBatchTransformer | Scala API: TimeIntervalMiniBatchTransformer | Source: TimeIntervalMiniBatchTransformer

FlattenBatch
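Undoes mini-batching, expanding each batched row back into individual rows.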

from synapse.ml.stages import *

df = spark.createDataFrame([(i, "foo") for i in range(1, 11)], ["in1", "in2"])

transDF = DynamicMiniBatchTransformer().transform(df)

fb = FlattenBatch()

fb.transform(transDF).show()
Python API: FlattenBatch | Scala API: FlattenBatch | Source: FlattenBatch

RenameColumn
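Renames the input column to the output column name.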

from synapse.ml.stages import *

df = spark.createDataFrame([
    (0, 0, "guitars", "drums", 1, True),
    (1, 1, "piano", "trumpet", 2, False),
    (2, 2, "bass", "cymbals", 3, True)
], ["numbers", "doubles", "words", "more", "longs", "booleans"])

rc = RenameColumn().setInputCol("words").setOutputCol("numbers")

rc.transform(df).show()
Python API: RenameColumn | Scala API: RenameColumn | Source: RenameColumn

Repartition
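Repartitions the DataFrame into the requested number of partitions.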

from synapse.ml.stages import *

df = spark.createDataFrame([
    (0, "guitars", "drums"),
    (1, "piano", "trumpet"),
    (2, "bass", "cymbals"),
    (3, "guitars", "drums"),
    (4, "piano", "trumpet"),
    (5, "bass", "cymbals"),
    (6, "guitars", "drums"),
    (7, "piano", "trumpet"),
    (8, "bass", "cymbals"),
    (9, "guitars", "drums"),
    (10, "piano", "trumpet"),
    (11, "bass", "cymbals")
], ["numbers", "words", "more"])

repartition = Repartition().setN(1)

repartition.transform(df).show()
Python API: Repartition | Scala API: Repartition | Source: Repartition

SelectColumns
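Keeps only the listed columns and drops the rest.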

from synapse.ml.stages import *

df = spark.createDataFrame([
    (0, 0.0, "guitars", "drums", 1, True),
    (1, 1.0, "piano", "trumpet", 2, False),
    (2, 2.0, "bass", "cymbals", 3, True)
], ["numbers", "doubles", "words", "more", "longs", "booleans"])

sc = SelectColumns().setCols(["words", "more"])

sc.transform(df).show()
Python API: SelectColumns | Scala API: SelectColumns | Source: SelectColumns

StratifiedRepartition
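Repartitions the DataFrame so that each partition receives a balanced share of every label value.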

from synapse.ml.stages import *

df = spark.createDataFrame([
    (0, "Blue", 2),
    (0, "Red", 2),
    (0, "Green", 2),
    (1, "Purple", 2),
    (1, "Orange", 2),
    (1, "Indigo", 2),
    (2, "Violet", 2),
    (2, "Black", 2),
    (2, "White", 2),
    (3, "Gray", 2),
    (3, "Yellow", 2),
    (3, "Cerulean", 2)
], ["values", "colors", "const"])

sr = StratifiedRepartition().setLabelCol("values").setMode("equal")
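
Applying the repartitioner follows the same pattern as the other stages:

sr.transform(df).show()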
Python API: StratifiedRepartition | Scala API: StratifiedRepartition | Source: StratifiedRepartition

SummarizeData
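Computes per-column summary statistics such as counts, missing-value rates, basic statistics, and percentiles.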

from synapse.ml.stages import *

df = spark.createDataFrame([
    (0, 0.0, "guitars", "drums", 1, True),
    (1, 1.0, "piano", "trumpet", 2, False),
    (2, 2.0, "bass", "cymbals", 3, True)
], ["numbers", "doubles", "words", "more", "longs", "booleans"])

summary = SummarizeData()

summary.transform(df).show()
Python API: SummarizeData | Scala API: SummarizeData | Source: SummarizeData

TextPreprocessor
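Normalizes the input text and applies a user-supplied substitution map.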

from synapse.ml.stages import *

df = spark.createDataFrame([
    ("The happy sad boy drank sap", ),
    ("The hater sad doy drank sap", ),
    ("foo", ),
    ("The hater sad doy aABc0123456789Zz_", )
], ["words1"])

testMap = {"happy": "sad", "hater": "sap",
           "sad": "sap", "sad doy": "sap"}

textPreprocessor = (TextPreprocessor()
    .setNormFunc("lowerCase")
    .setMap(testMap)
    .setInputCol("words1")
    .setOutputCol("out"))

textPreprocessor.transform(df).show()
Python API: TextPreprocessor | Scala API: TextPreprocessor | Source: TextPreprocessor

UDFTransformer
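Applies a user-defined function to the input column as a pipeline stage.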

from synapse.ml.stages import *
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

df = spark.createDataFrame([
    (0, 0.0, "guitars", "drums", 1, True),
    (1, 1.0, "piano", "trumpet", 2, False),
    (2, 2.0, "bass", "cymbals", 3, True)
], ["numbers", "doubles", "words", "more", "longs", "booleans"])

stringToIntegerUDF = udf(lambda x: 1, IntegerType())

udfTransformer = (UDFTransformer()
    .setUDF(stringToIntegerUDF)
    .setInputCol("numbers")
    .setOutputCol("out"))

udfTransformer.transform(df).show()
Python API: UDFTransformer | Scala API: UDFTransformer | Source: UDFTransformer

UnicodeNormalize
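Applies a Unicode normalization form (here NFC) so that visually identical strings, such as the two spellings of "Schön" below, become byte-identical.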

from synapse.ml.stages import *

df = spark.createDataFrame([
    ("Schön", 1),
    ("Scho\u0308n", 1),
    (None, 1)
], ["words1", "dummy"])

unicodeNormalize = (UnicodeNormalize()
    .setForm("NFC")
    .setInputCol("words1")
    .setOutputCol("norm1"))

unicodeNormalize.transform(df).show()
Python API: UnicodeNormalize | Scala API: UnicodeNormalize | Source: UnicodeNormalize

Train

ComputeModelStatistics
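Computes dataset-level evaluation metrics (classification or regression) over scored data.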

from synapse.ml.train import *
from numpy import random

df = spark.createDataFrame(
    [(random.rand(), random.rand()) for _ in range(2048)], ["label", "prediction"]
)

cms = (ComputeModelStatistics()
    .setLabelCol("label")
    .setScoredLabelsCol("prediction")
    .setEvaluationMetric("classification"))

cms.transform(df).show()
Python API: ComputeModelStatistics | Scala API: ComputeModelStatistics | Source: ComputeModelStatistics

ComputePerInstanceStatistics
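Computes evaluation metrics for each individual row of a scored dataset.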

from synapse.ml.train import *

cps = (ComputePerInstanceStatistics()
    .setLabelCol("label")
    .setScoredLabelsCol("LogRegScoredLabelsCol")
    .setScoresCol("LogRegScoresCol")
    .setScoredProbabilitiesCol("LogRegProbCol")
    .setEvaluationMetric("classification"))
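
This stage expects a dataset already scored by a model; a hypothetical call, assuming a DataFrame scoredData with the column names configured above:

# cps.transform(scoredData).show()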
Python API: ComputePerInstanceStatistics | Scala API: ComputePerInstanceStatistics | Source: ComputePerInstanceStatistics