Using predicates¶
This example shows how the `where`
option can be used with analyzers to restrict a metric to the rows matching a predicate.
In [1]:
Copied!
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql import SparkSession
import pandas as pd
In [2]:
Copied!
# Open a session against the Spark Connect endpoint used throughout this
# example; adjust the URL if your server listens elsewhere.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
In [3]:
Copied!
# Location of the iris sample data (CSV hosted in the seaborn-data repository).
IRIS_CSV_URL = "https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/iris.csv"

# Read the CSV with pandas once, then convert it into a Spark DataFrame.
iris = spark.createDataFrame(pd.read_csv(IRIS_CSV_URL))
In [4]:
Copied!
# Preview the data: four numeric measurement columns plus the `species` label
# that the predicates below filter on.
iris.show()
+------------+-----------+------------+-----------+-------+ |sepal_length|sepal_width|petal_length|petal_width|species| +------------+-----------+------------+-----------+-------+ | 5.1| 3.5| 1.4| 0.2| setosa| | 4.9| 3.0| 1.4| 0.2| setosa| | 4.7| 3.2| 1.3| 0.2| setosa| | 4.6| 3.1| 1.5| 0.2| setosa| | 5.0| 3.6| 1.4| 0.2| setosa| | 5.4| 3.9| 1.7| 0.4| setosa| | 4.6| 3.4| 1.4| 0.3| setosa| | 5.0| 3.4| 1.5| 0.2| setosa| | 4.4| 2.9| 1.4| 0.2| setosa| | 4.9| 3.1| 1.5| 0.1| setosa| | 5.4| 3.7| 1.5| 0.2| setosa| | 4.8| 3.4| 1.6| 0.2| setosa| | 4.8| 3.0| 1.4| 0.1| setosa| | 4.3| 3.0| 1.1| 0.1| setosa| | 5.8| 4.0| 1.2| 0.2| setosa| | 5.7| 4.4| 1.5| 0.4| setosa| | 5.4| 3.9| 1.3| 0.4| setosa| | 5.1| 3.5| 1.4| 0.3| setosa| | 5.7| 3.8| 1.7| 0.3| setosa| | 5.1| 3.8| 1.5| 0.3| setosa| +------------+-----------+------------+-----------+-------+ only showing top 20 rows
In [5]:
Copied!
from tsumugi.verification import VerificationSuite
from tsumugi.analyzers import Mean
from tsumugi.checks import CheckBuilder
from tsumugi.verification import VerificationSuite
from tsumugi.analyzers import Mean
from tsumugi.checks import CheckBuilder
In [6]:
Copied!
# Register the same Mean analyzer on `sepal_length` three times, each with a
# different `where` predicate, so that one mean is reported per species.
suite = (
    VerificationSuite
    .on_data(iris)
    .add_required_analyzer(
        Mean(column="sepal_length", where="species = 'setosa'")
    )
    .add_required_analyzer(
        Mean(column="sepal_length", where="species = 'versicolor'")
    )
    .add_required_analyzer(
        Mean(column="sepal_length", where="species = 'virginica'")
    )
)
In [7]:
Copied!
# Execute the configured suite once and keep the result for inspection.
result = suite.run()
In [8]:
Copied!
# Bare last expression so the notebook renders the metrics table; each row's
# name carries the `where` predicate that produced it.
result.metrics_as_pandas()
Out[8]:
| | entity | instance | name | value |
---|---|---|---|---|
0 | Column | sepal_length | Mean (where: species = 'setosa') | 5.006 |
1 | Column | sepal_length | Mean (where: species = 'versicolor') | 5.936 |
2 | Column | sepal_length | Mean (where: species = 'virginica') | 6.588 |