Using predicates

This example shows how the where option can be used with analyzers.

In [1]:

Copied!

# Core dependencies: Spark session entry point and pandas for reading the CSV.
import pandas as pd
from pyspark.sql import SparkSession

In [2]:

Copied!

# Obtain (or reuse) a session against the remote endpoint at sc://localhost:15002.
# The builder is invoked once; a second identical call would be redundant.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

In [3]:

Copied!

# Read the iris CSV with pandas, then convert it to a Spark DataFrame.
# Done once so the remote file is downloaded a single time.
iris = spark.createDataFrame(
    pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/iris.csv")
)

In [4]:

Copied!

# Preview the loaded data (prints the first rows of the DataFrame).
iris.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1|        0.1| setosa|
|         5.8|        4.0|         1.2|        0.2| setosa|
|         5.7|        4.4|         1.5|        0.4| setosa|
|         5.4|        3.9|         1.3|        0.4| setosa|
|         5.1|        3.5|         1.4|        0.3| setosa|
|         5.7|        3.8|         1.7|        0.3| setosa|
|         5.1|        3.8|         1.5|        0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows

In [5]:

Copied!

# tsumugi building blocks used to assemble and run the verification suite.
from tsumugi.analyzers import Mean
from tsumugi.checks import CheckBuilder
from tsumugi.verification import VerificationSuite

In [6]:

Copied!





# Build the verification suite once: one Mean analyzer per species, each
# carrying its own `where` predicate on the `species` column so the metric
# is reported separately for every predicate.
suite = (
    VerificationSuite
    .on_data(iris)
    .add_required_analyzer(
        Mean(column="sepal_length", where="species = 'setosa'")
    )
    .add_required_analyzer(
        Mean(column="sepal_length", where="species = 'versicolor'")
    )
    .add_required_analyzer(
        Mean(column="sepal_length", where="species = 'virginica'")
    )
)

In [7]:

Copied!

# Execute the suite a single time; running it twice would repeat the whole
# computation only to overwrite `result` with an equivalent value.
result = suite.run()

In [8]:

Copied!

# Show the computed metrics as the cell output (one row per analyzer).
result.metrics_as_pandas()

Out[8]:

	entity	instance	name	value
0	Column	sepal_length	Mean (where: species = 'setosa')	5.006
1	Column	sepal_length	Mean (where: species = 'versicolor')	5.936
2	Column	sepal_length	Mean (where: species = 'virginica')	6.588