"""
Authors: Arthur Rodrigues Scarpatto and Leonardo de Sousa Marques
Affiliation: Embedded Computing Lab (ECL), Federal University of Santa Catarina (UFSC)
Description:
Calculates the statistical significance of speedup results based on the Wilcoxon-Mann-Whitney test.
Uses the configuration file to automatically locate the baseline and codec result paths.
"""
import argparse
import json
import math
import os
import sys
from enum import Enum
from pathlib import Path
from typing import Dict, List, Tuple
from scipy.stats import mannwhitneyu
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from lfc_toolkit.src.configuration.configuration_reader import read_config_from_argv
# Significance level compared against the Mann-Whitney p-value; the confidence level is 1 - ALPHA.
ALPHA = 0.05
# Verbosity level read by log(); main() sets it to 1 when -v/--verbose is given.
VERBOSITY = 0
[docs]
class SoftwareType(Enum):
    """Kinds of codec software whose execution times are compared by this script."""

    # The member values are the lowercase names printed in the progress messages.
    ENCODER = "encoder"
    DECODER = "decoder"
[docs]
class ArgumentMismatchError(ValueError):
    """Signals that a result set lacks a Lightfield or BPP entry that it is expected to contain."""

    def __init__(self, arg_name: str, file: str):
        """Builds the error message and records what was missing and where.

        :param arg_name: Name of the argument (Lightfield or BPP) that is missing
        :type arg_name: str
        :param file: Path or identifier of the results file that lacks the argument
        :type file: str
        """
        message = f"Argument mismatch: '{arg_name}' should be present in the results contained at {file}"
        super().__init__(message)
        # Kept as attributes so handlers can inspect the mismatch programmatically.
        self.arg_name = arg_name
        self.file = file
[docs]
def log(msg: str, verbose: bool = False) -> None:
    """Prints a message to stdout, optionally gated by the module verbosity.

    :param msg: Text to print
    :type msg: str
    :param verbose: When True, the message is printed (with a ``[VERBOSE] - `` prefix)
        only if VERBOSITY > 0; when False, the message is always printed as-is
    :type verbose: bool
    :return: None
    :rtype: None
    """
    if not verbose:
        print(msg)
        return
    # Verbose messages are dropped unless verbosity has been enabled.
    if VERBOSITY > 0:
        print("[VERBOSE] - " + msg)
[docs]
def komolgorov_smirnov(X: List[float], Y: List[float]) -> bool:
    """Performs the two-sample Kolmogorov-Smirnov test at the 5% significance level.

    Computes the largest absolute difference between the empirical cumulative
    distribution functions of X and Y and compares it against the critical
    distance for alpha = 5%. When the caller passes median-normalized samples,
    a True result means the distributions differ only by a shift
    (Fy(t) = Fx(t + delta)). The inputs do not need to be sorted.

    :param X: First sample of floating point values (must not be empty)
    :type X: List[float]
    :param Y: Second sample of floating point values (must not be empty)
    :type Y: List[float]
    :return: True if the distributions are considered the same, False otherwise
    :rtype: bool
    :raises ValueError: If either sample is empty
    """
    # Imported locally so the function does not depend on a new file-level import.
    from bisect import bisect_right

    if len(X) == 0 or len(Y) == 0:
        raise ValueError(
            f"Both samples must be non-empty. Length of sample X = {len(X)}, of sample Y = {len(Y)}"
        )

    # source: https://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ks2samp.htm
    ordered_X = sorted(X)
    ordered_Y = sorted(Y)
    size_X = len(ordered_X)
    size_Y = len(ordered_Y)

    # bisect_right on a sorted sample counts the values <= val, i.e. the
    # empirical CDF numerator, without rescanning the whole sample each time.
    max_distance = max(
        abs(bisect_right(ordered_Y, val) / size_Y - bisect_right(ordered_X, val) / size_X)
        for val in ordered_X + ordered_Y
    )

    c_alpha = 1.36  # this is a pre-computed constant for alpha = 5%
    critical_distance = c_alpha * math.sqrt((size_X + size_Y) / (size_X * size_Y))

    # Exceeding the critical distance rejects the null hypothesis (distributions differ).
    return max_distance <= critical_distance
[docs]
def _lower_median(ordered: List[float]) -> float:
    """Returns the lower median of an already sorted, non-empty sample."""
    # (n - 1) // 2 is the middle index for odd n and the lower-middle index for even n.
    return ordered[(len(ordered) - 1) // 2]


def get_statistical_significances(X: List[float], Y: List[float]) -> Tuple[bool, str]:
    """Calculates Wilcoxon-Mann-Whitney statistical significance between baseline and result samples.

    Follows the flow outlined by Touati et al. (2012): both samples are
    normalized by their own (lower) median, the Komolgorov-Smirnov check
    decides whether they differ only by a shift, and the unpaired, one-sided
    Mann-Whitney U test is then evaluated against ALPHA.

    :param X: Baseline sample
    :type X: List[float]
    :param Y: Result sample
    :type Y: List[float]
    :return: Tuple of (is_significant, message); the message is empty when the
        result is significant and explains the cause otherwise
    :rtype: Tuple[bool, str]
    """
    if len(X) == 0 or len(Y) == 0:
        # Nothing can be tested; report it instead of failing on the median lookup.
        return False, f"Cannot test empty samples. Length of sample X = {len(X)}, of sample Y = {len(Y)}"

    sorted_X = sorted(X)
    sorted_Y = sorted(Y)
    # Each sample is centred on its own median, so differing sample sizes are handled correctly.
    median_X = _lower_median(sorted_X)
    median_Y = _lower_median(sorted_Y)
    norm_X = [x - median_X for x in sorted_X]
    norm_Y = [y - median_Y for y in sorted_Y]

    # Checks if this is a valid candidate for a WMW (Wilcoxon-Mann-Whitney) test
    if komolgorov_smirnov(norm_X, norm_Y):
        log("Result set is valid for the Wilcoxon-Mann-Whitney two-sample statistical significance check.", verbose=True)
    else:
        log("Result set is not valid for the Wilcoxon-Mann-Whitney statistical significance check.", verbose=True)
        if len(X) < 30 or len(Y) < 30:
            log("Not enough values to conclude anything. Please run more than 30 runs on both samples.", verbose=True)
            return False, f"Results do not pass the Komolgorov-Smirnov test, but there are not enough values to conclude. Run more than 30 runs on both samples and try again. Length of sample X = {len(X)}, of sample Y = {len(Y)}"
        # With large samples the test is still carried out, as in the original flow.
        # NOTE(review): the messages ask for "more than 30" runs while the check
        # accepts exactly 30 - confirm the intended threshold against the paper.
        log("Both samples are large enough; proceeding with the Wilcoxon-Mann-Whitney test anyway.", verbose=True)

    log("Calculating the unpaired, one-sided Wilcoxon-Mann-Whitney test.", verbose=True)
    log(f"First few X values: {X[:3]}", verbose=True)
    log(f"First few Y values: {Y[:3]}", verbose=True)
    log(f"X length: {len(X)}, Y length: {len(Y)}", verbose=True)

    # alternative='greater': tests whether the baseline X is stochastically greater than Y.
    result = mannwhitneyu(X, Y, alternative='greater', method='exact')
    if result.pvalue < ALPHA:
        log(f"result is statistically significant with p-value = {result.pvalue} and alpha {ALPHA}", verbose=True)
        log(f"The confidence level is therefore 1 - alpha = {1 - ALPHA}", verbose=True)
        return True, ""

    log(f"result is not statistically significant, with p-value = {result.pvalue} and alpha {ALPHA}", verbose=True)
    return False, f"p-value {result.pvalue} is not smaller than the alpha {ALPHA}"
[docs]
def find_codec_json_file(results_path: Path) -> Path:
    """Finds the JSON log file in the results path.

    Only files directly inside ``results_path`` are considered. When several
    JSON files exist, the lexicographically first path is returned so the
    choice is the same on every run and platform (glob order is not guaranteed).

    :param results_path: Path to the results directory
    :type results_path: Path
    :return: Path to the first JSON file (in sorted order) found
    :rtype: Path
    :raises FileNotFoundError: If no JSON file is found in the path
    """
    json_files = sorted(results_path.glob("*.json"))
    if not json_files:
        raise FileNotFoundError(f"No JSON log file found in {results_path}")
    return json_files[0]
[docs]
def main() -> None:
    """Runs the statistical significance checks described by the configuration file.

    Reads the configuration from the command line, parses the optional
    ``-v/--verbose`` flag, and for every performance configuration block
    compares each analysed codec against the baseline codec, per software
    type (encoder, decoder), Lightfield and BPP, printing one verdict per
    comparison.

    The process exits with status 1 when the performance configuration is
    empty, when a block defines no baseline or no codecs to analyse, or when
    any error occurs while the comparisons run.

    :return: None
    :rtype: None
    """
    global VERBOSITY

    # Read configuration from command line arguments FIRST (before argparse)
    # This is the same pattern used in speedup.py and script.py
    base_path = Path(os.path.abspath(os.path.dirname(sys.argv[0])))
    configuration = read_config_from_argv(overriden_base_path=base_path / "..")

    # Now parse remaining arguments (only verbose flag)
    parser = argparse.ArgumentParser(
        description="Calculates the statistical significance of speedup results based on the Wilcoxon-Mann-Whitney test.",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('-v', '--verbose', action='store_true', help="Increase output verbosity.")
    # parse_known_args: arguments consumed by the configuration reader are ignored here.
    args, _remaining = parser.parse_known_args()
    if args.verbose:
        VERBOSITY = 1

    # Get performance configurations
    performance_configurations = configuration["performance"]
    if not performance_configurations:
        print("Error: No 'performance' configuration found in configuration file.", file=sys.stderr)
        sys.exit(1)

    for perf_config in performance_configurations:
        print("\nProcessing statistical significance tests for performance configuration block...")

        # Get codecs to process and baseline
        codecs_to_process = perf_config["codecs"].get("analyse", [])
        baseline_codec = perf_config["codecs"].get("baseline", None)
        # NOTE(review): the two messages below say "Warning"/"Skipping" but the
        # script terminates; behavior kept as in the original.
        if not baseline_codec:
            print("Warning: No baseline_codec defined in performance configuration.", file=sys.stderr)
            sys.exit(1)
        if not codecs_to_process:
            print("Warning: No codecs_to_process defined in performance configuration. Skipping.", file=sys.stderr)
            sys.exit(1)
        if baseline_codec not in codecs_to_process:
            print(f"Warning: baseline_codec '{baseline_codec}' not in codecs_to_process. Adding it.", file=sys.stderr)
            codecs_to_process = [baseline_codec] + [c for c in codecs_to_process if c != baseline_codec]

        # Find baseline JSON file
        baseline_codec_config = configuration["codecs"]["configuration"].get(baseline_codec, {})
        if not baseline_codec_config:
            print(f"Error: No configuration found for baseline codec '{baseline_codec}'", file=sys.stderr)
            continue
        baseline_results = baseline_codec_config.get("results")
        if not baseline_results:
            # os.path.expandvars(None) would raise TypeError; report the missing key instead.
            print(f"Error: No 'results' path configured for baseline codec '{baseline_codec}'", file=sys.stderr)
            continue
        baseline_results_path = Path(os.path.expandvars(baseline_results))
        try:
            baseline_json_file = find_codec_json_file(baseline_results_path)
        except FileNotFoundError as e:
            print(f"Error: {e}", file=sys.stderr)
            continue
        log(f"Found baseline log file: {baseline_json_file}")

        # Find JSON files for all other codecs
        codec_json_files = {}
        for codec_name in codecs_to_process:
            if codec_name == baseline_codec:
                codec_json_files[codec_name] = baseline_json_file
                continue
            codec_config = configuration["codecs"]["configuration"].get(codec_name, {})
            if not codec_config:
                print(f"Warning: No configuration found for codec '{codec_name}'. Skipping.", file=sys.stderr)
                continue
            codec_results = codec_config.get("results")
            if not codec_results:
                print(f"Warning: No 'results' path configured for codec '{codec_name}'. Skipping.", file=sys.stderr)
                continue
            results_path = Path(os.path.expandvars(codec_results))
            try:
                json_file = find_codec_json_file(results_path)
            except FileNotFoundError as e:
                print(f"Warning: {e}. Skipping codec '{codec_name}'.", file=sys.stderr)
                continue
            codec_json_files[codec_name] = json_file
            log(f"Found log file for {codec_name}: {json_file}", verbose=True)

        # Run statistical significance tests
        for stype in [SoftwareType.ENCODER, SoftwareType.DECODER]:
            print(f"\033[94m! Running statistical significance tests for the {stype.value}.\033[0m")
            try:
                # Extract baseline data
                # NOTE(review): extract_log_execution_times is neither defined nor
                # imported in the part of the file reviewed here - confirm it is
                # provided, otherwise this raises NameError (reported by the
                # generic handler below).
                baseline_dict = extract_log_execution_times(baseline_json_file, stype)

                # Compare each codec against baseline
                for codec_name, codec_json_file in codec_json_files.items():
                    if codec_name == baseline_codec:
                        continue  # Skip baseline
                    print(f"\033[94m!! Comparing {codec_name} against baseline {baseline_codec}.\033[0m")
                    codec_dict = extract_log_execution_times(codec_json_file, stype)

                    # Compare each lightfield and BPP
                    for lf, lf_obj in baseline_dict.items():
                        if not codec_dict.get(lf):
                            raise ArgumentMismatchError(lf, str(codec_json_file))
                        for bpp, sample_x in lf_obj.items():
                            if not codec_dict[lf].get(bpp):
                                raise ArgumentMismatchError(f"{bpp}", str(codec_json_file))
                            X = sample_x
                            Y = codec_dict[lf][bpp]
                            log(f"X samples from baseline: {X}", verbose=True)
                            log(f"Y samples from results: {Y}", verbose=True)
                            significance, cause = get_statistical_significances(X, Y)

                            # Fixed-width prefix keeps the verdict column aligned.
                            prefix = f"[{codec_name} / LF {lf} / BPP {bpp}]"
                            padded_prefix = f"{prefix:<60}"
                            if significance:
                                print(f"\033[32m{padded_prefix}\033[0m \033[94mStatistically significant.\033[0m")
                            else:
                                padding = ' ' * (len(padded_prefix) // 3)
                                print(f"\033[31m{padded_prefix}\033[0m \033[94mNOT statistically significant.\033[0m")
                                print(f"{padding} -> {codec_name} compared to baseline {baseline_codec}")
                                print(f"{padding} -> Reason: {cause}")
                    # Blank line between the reports of consecutive codecs.
                    print("")
            except ArgumentMismatchError as e:
                print(f"\033[31mError: {e}\033[0m", file=sys.stderr)
                sys.exit(1)
            except FileNotFoundError as e:
                print(f"\033[31mError: File not found: {e}\033[0m", file=sys.stderr)
                sys.exit(1)
            except Exception as e:
                # Top-level boundary: report any other failure and stop.
                print(f"\033[31mError: {type(e).__name__}: {e}\033[0m", file=sys.stderr)
                sys.exit(1)
# Run the analysis only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()