Skip to main content
Version: 0.9.5

Automated Snow Leopard Detection with Synapse Machine Learning

import os

# When running inside Azure Synapse (Project Arcadia), bootstrap the Spark
# session and fetch the Bing Image Search key from the linked Key Vault.
# (Extraction had flattened this block's indentation, making it a SyntaxError.)
if os.environ.get("AZURE_SERVICE", None) == "Microsoft.ProjectArcadia":
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    from notebookutils.mssparkutils.credentials import getSecret

    os.environ["BING_IMAGE_SEARCH_KEY"] = getSecret("mmlspark-keys", "bing-image-search-key")

# WARNING: this notebook requires a lot of memory.
# If you get a heap-space error, try reducing the number of images Bing returns
# or write the images out to parquet first.

# Replace the following with a line like: BING_IMAGE_SEARCH_KEY = "hdwo2oyd3o928s....."
from import *
from import FluentAPI
from pyspark.sql.functions import lit

def bingPhotoSearch(name, queries, pages):
    """Run Bing Image Search for each (query, offset) pair and return a
    DataFrame of image URLs labeled with *name*.

    :param name:    label value written to the ``labels`` column
    :param queries: list of query strings to search for
    :param pages:   number of result pages (10 images each) per query
    """
    offsets = [offset * 10 for offset in range(0, pages)]
    parameters = [(query, offset) for offset in offsets for query in queries]

    # NOTE(review): the `.mlTransform(BingImageSearch()` wrapper was lost in
    # extraction (only the orphaned setter chain survived); reconstructed here.
    return spark.createDataFrame(parameters, ("queries", "offsets")) \
        .mlTransform(
            BingImageSearch()                           # Apply Bing Image Search
            .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)  # Set the API Key
            .setOffsetCol("offsets")                    # Column containing the offsets
            .setQueryCol("queries")                     # Column containing the query words
            .setCount(10)                               # Images to return per offset
            .setImageType("photo")                      # Filter to ensure we get photos
            .setOutputCol("images")) \
        .mlTransform(BingImageSearch.getUrlTransformer("images", "urls")) \
        .withColumn("labels", lit(name)) \
        .limit(400)

def displayDF(df, n=5, image_cols=set(["urls"])):
    """Render the first *n* rows of *df* as an HTML table via ``displayHTML``.

    Columns listed in *image_cols* are rendered as ``<img>`` tags; all other
    columns are shown as plain text.
    """
    rows = df.take(n)
    cols = df.columns
    header = "".join(["<th>" + c + "</th>" for c in cols])

    # NOTE(review): the original CSS template was lost in extraction; this is
    # a minimal reconstruction of the notebook's table styling.
    style = """
<!DOCTYPE html>
<html>
<head>
<style>
table {
    font-family: arial, sans-serif;
    border-collapse: collapse;
    width: 300;
}
td, th {
    border: 1px solid #dddddd;
    text-align: left;
    padding: 8px;
}
tr:nth-child(even) {
    background-color: #dddddd;
}
</style>
</head>"""

    table = []
    for row in rows:
        table.append("<tr>")
        for col in cols:
            if col in image_cols:
                rep = '<img src="{}", width="100">'.format(row[col])
            else:
                rep = row[col]
            table.append("<td>{}</td>".format(rep))
        table.append("</tr>")
    tableHTML = "".join(table)

    body = """
<body>
<table>
  <tr>
    {}
  </tr>
  {}
</table>
</body>
</html>
""".format(header, tableHTML)
    displayHTML(style + body)
snowLeopardQueries = ["snow leopard"]
snowLeopardUrls = bingPhotoSearch("snow leopard", snowLeopardQueries, pages=100)
randomWords ="wasbs://").cache()
randomLinks = randomWords \
.setOutputCol("images")) \
.mlTransform(BingImageSearch.getUrlTransformer("images", "urls")) \
.withColumn("label", lit("other")) \

images = snowLeopardUrls.union(randomLinks).distinct().repartition(100)\
.mlTransform(BingImageSearch.downloadFromUrls("urls", "image", concurrency=5, timeout=5000))\

train, test = images.randomSplit([.7,.3], seed=1)
from import Pipeline
from import StringIndexer
from import LogisticRegression
from pyspark.sql.functions import udf
from import ModelDownloader
from import ImageFeaturizer
from import UDFTransformer
from pyspark.sql.types import *

def getIndex(row):
    """Return the second element of *row* as a float.

    Used as a UDF to extract the class index from a prediction vector row.
    """
    return float(row[1])

# Download the pretrained ResNet50 featurizer model. The storage location
# differs by platform; the `else:` (Databricks path) was lost in extraction.
# NOTE(review): abfss URL reconstructed — TODO confirm account/container.
if os.environ.get("AZURE_SERVICE", None) == "Microsoft.ProjectArcadia":
    network = ModelDownloader(spark, "abfss://[email protected]/models/").downloadByName("ResNet50")
else:
    network = ModelDownloader(spark, "dbfs:/Models/").downloadByName("ResNet50")

model = Pipeline(stages=[
StringIndexer(inputCol = "labels", outputCol="index"),
ImageFeaturizer(inputCol="image", outputCol="features", cutOutputLayers=1).setModel(network),
LogisticRegression(maxIter=5, labelCol="index", regParam=10.0),
.setUDF(udf(getIndex, DoubleType()))\

fitModel =
def plotConfusionMatrix(df, label, prediction, classLabels):
from import confusionMatrix
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(4.5, 4.5))
confusionMatrix(df, label, prediction, classLabels)

# Plot the confusion matrix on the held-out test set. Skipped on Synapse,
# where matplotlib rendering is not available in this notebook environment.
# (Extraction had flattened the indentation under the `if`.)
if os.environ.get("AZURE_SERVICE", None) != "Microsoft.ProjectArcadia":
    plotConfusionMatrix(fitModel.transform(test), "index", "prediction", fitModel.stages[0].labels)
import urllib.request
from import ImageLIME

test_image_url = ""
with urllib.request.urlopen(test_image_url) as url:
barr =
test_subsample = spark.createDataFrame([(bytearray(barr),)], ["image"])

# Explain the fitted model's prediction on the sample image with LIME.
# NOTE(review): the entire setter chain was lost in extraction; reconstructed
# from the published notebook — TODO confirm parameter values.
lime = ImageLIME() \
    .setModel(fitModel) \
    .setLabelCol("prediction_entry") \
    .setOutputCol("weights") \
    .setInputCol("image") \
    .setCellSize(100.0) \
    .setModifier(50.0) \
    .setNSamples(300)

result = lime.transform(test_subsample)
import matplotlib.pyplot as plt
import PIL, io, numpy as np

def plot_superpixels(row):
image_bytes = row['image']
superpixels = row['superpixels']['clusters']
weights = list(row['weights'])
mean_weight = np.percentile(weights,90)
img = ('RGBA')
image_array = np.asarray(img).copy()
for (sp, w) in zip(superpixels, weights):
if w > mean_weight:
for (x, y) in sp:
image_array[y, x, 1] = 255
image_array[y, x, 3] = 200

# Gets first row from the LIME-transformed data frame and plots it.
# NOTE(review): the call inside the `if` was lost in extraction; reconstructed.
if os.environ.get("AZURE_SERVICE", None) != "Microsoft.ProjectArcadia":
    plot_superpixels(result.take(1)[0])

Your results will look like: