Source code for src.performance.statistical_significance

"""
Authors: Arthur Rodrigues Scarpatto and Leonardo de Sousa Marques
Affiliation: Embedded Computing Lab (ECL), Federal University of Santa Catarina (UFSC)

Description:
    Calculates the statistical significance of speedup results using the Wilcoxon-Mann-Whitney test.
    Uses the configuration file to automatically locate the baseline and codec result paths.
"""

import argparse
import json
import math
import os
import sys
from enum import Enum
from pathlib import Path
from typing import Dict, List, Tuple

from scipy.stats import mannwhitneyu

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from lfc_toolkit.src.configuration.configuration_reader import read_config_from_argv

# Significance level that p-values are compared against; the confidence level is 1 - ALPHA.
ALPHA = 0.05
# Module-wide verbosity; main() raises it to 1 when -v/--verbose is given.
VERBOSITY = 0

class SoftwareType(Enum):
    """Codec stage whose execution times are analysed.

    The value of each member is the key used to look that stage up inside a
    light field's entry of the results JSON.
    """

    ENCODER = "encoder"
    DECODER = "decoder"
class ArgumentMismatchError(ValueError):
    """Raised when an expected light field or BPP entry is absent from a results file."""

    def __init__(self, arg_name: str, file: str):
        """Build the error message and keep both pieces of context on the instance.

        :param arg_name: Name of the light field or BPP value that is missing
        :type arg_name: str
        :param file: Path or identifier of the results file that lacks it
        :type file: str
        """
        message = f"Argument mismatch: '{arg_name}' should be present in the results contained at {file}"
        super().__init__(message)
        self.arg_name = arg_name
        self.file = file
def log(msg: str, verbose: bool = False) -> None:
    """Print a message, hiding verbose-only messages unless verbosity is enabled.

    :param msg: Text to print
    :type msg: str
    :param verbose: When True the message is printed (with a ``[VERBOSE] - ``
        prefix) only if VERBOSITY > 0; when False it is always printed as-is
    :type verbose: bool
    :return: None
    :rtype: None
    """
    # Regular messages are unconditional.
    if not verbose:
        print(msg)
        return

    # Verbose messages are gated on the module-level verbosity switch.
    if VERBOSITY > 0:
        print("[VERBOSE] - " + msg)
def komolgorov_smirnov(X: List[float], Y: List[float]) -> bool:
    """Run the two-sample Kolmogorov-Smirnov test at the 5% significance level.

    The caller is expected to pass samples already centred on their medians,
    so that accepting the null hypothesis means the original distributions
    differ only by a shift (Fy(t) = Fx(t + delta)). The samples do not need
    to be sorted; sorted copies are made internally.

    :param X: First sample of floating point values
    :type X: List[float]
    :param Y: Second sample of floating point values
    :type Y: List[float]
    :return: True if the largest gap between the two empirical CDFs does not
        exceed the critical distance (same distribution), False otherwise
    :rtype: bool
    :raises ValueError: If either sample is empty
    """
    from bisect import bisect_right

    if not X or not Y:
        raise ValueError("komolgorov_smirnov requires two non-empty samples")

    sorted_x = sorted(X)
    sorted_y = sorted(Y)
    size_x = len(sorted_x)
    size_y = len(sorted_y)

    # source: https://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ks2samp.htm
    # bisect_right on a sorted sample returns how many of its elements are
    # <= val, i.e. the empirical CDF numerator, without rescanning the sample.
    max_distance = max(
        abs(bisect_right(sorted_y, val) / size_y - bisect_right(sorted_x, val) / size_x)
        for val in sorted_x + sorted_y
    )

    c_alpha = 1.36  # this is a pre-computed constant for alpha = 5%
    critical_distance = c_alpha * math.sqrt((size_x + size_y) / (size_x * size_y))

    # Exceeding the critical distance rejects the null hypothesis (the
    # distributions differ significantly); otherwise they are taken as equal.
    return max_distance <= critical_distance
def extract_log_execution_times(
    filepath: Path, software_type: SoftwareType
) -> Dict[str, Dict[float, List[float]]]:
    """Collect the execution-time samples stored in a results log.

    The log is a JSON document (or a list whose first element is such a
    document) with a ``results`` mapping. The returned dictionary has the form
    {
        "lf_1": {bpp_1: [sample_1, sample_2, ...], ..., bpp_n: [...]},
        (...)
        "lf_n": {bpp_1: [...], ..., bpp_n: [...]},
    }
    where every BPP key is converted to ``float`` and every sample is the
    ``time_ns`` value of one ``log_data`` entry, also as ``float``.

    Every light field gets an entry, possibly empty. A BPP whose data cannot
    be converted is reported through a verbose log message and left out, as
    is a BPP that yields no samples.

    :param filepath: Path to the JSON log file
    :type filepath: Path
    :param software_type: Stage (encoder or decoder) whose timings are read
    :type software_type: SoftwareType
    :return: Samples grouped by light field and then by BPP
    :rtype: Dict[str, Dict[float, List[float]]]
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as log_file:
        log_json = json.load(log_file)

    # Handle both list and dict formats (only the first list element is used)
    root = log_json[0] if isinstance(log_json, list) else log_json

    exec_time_dict: Dict[str, Dict[float, List[float]]] = {}
    for lf, lf_obj in root["results"].items():
        samples_by_bpp: Dict[float, List[float]] = {}
        exec_time_dict[lf] = samples_by_bpp

        for bpp_str, bpp_obj in lf_obj.get(software_type.value, {}).items():
            try:
                bpp = float(bpp_str)
                # Extract time_ns values from log_data entries
                time_values = [
                    float(log_entry['time_ns'])
                    for log_entry in bpp_obj.get('log_data', [])
                    if 'time_ns' in log_entry
                ]
            except (ValueError, TypeError, KeyError) as e:
                log(f"Warning: Could not extract time_ns for {lf} {bpp_str} {software_type.value}: {e}", verbose=True)
                continue

            if time_values:
                samples_by_bpp[bpp] = time_values

    return exec_time_dict
def get_statistical_significances(X: List[float], Y: List[float]) -> Tuple[bool, str]:
    """Calculates Wilcoxon-Mann-Whitney statistical significance between baseline and result samples.

    Follows the flow outlined by Touati et al. (2012):

    1. Both samples are centred on their own medians and compared with the
       Kolmogorov-Smirnov test.
    2. If that check fails and either sample has fewer than 30 values, no
       conclusion is drawn.
    3. Otherwise the unpaired, one-sided Wilcoxon-Mann-Whitney test is run
       with alternative ``'greater'`` and its p-value is compared with ALPHA.

    :param X: Baseline sample
    :type X: List[float]
    :param Y: Result sample
    :type Y: List[float]
    :return: Tuple of (is_significant, message); the message is empty when the
        result is significant and explains the reason otherwise
    :rtype: Tuple[bool, str]
    :raises ValueError: If either sample is empty
    """
    if not X or not Y:
        raise ValueError("get_statistical_significances requires two non-empty samples")

    sorted_X = sorted(X)
    sorted_Y = sorted(Y)

    # Each sample is centred on its OWN median, so the index must come from
    # that sample's size (the two sizes may differ). For an even size the
    # lower of the two middle elements is used.
    median_X = sorted_X[(len(sorted_X) - 1) // 2]
    median_Y = sorted_Y[(len(sorted_Y) - 1) // 2]
    norm_X = [x - median_X for x in sorted_X]
    norm_Y = [y - median_Y for y in sorted_Y]

    # Checks if this is a valid candidate for a WMW (Wilcoxon-Mann-Whitney) test
    is_valid_wmw = komolgorov_smirnov(norm_X, norm_Y)
    if is_valid_wmw:
        log("Result set is valid for the Wilcoxon-Mann-Whitney two-sample statistical significance check.", verbose=True)
    else:
        log("Result set is not valid for the Wilcoxon-Mann-Whitney statistical significance check.", verbose=True)
        if len(X) < 30 or len(Y) < 30:
            log("Not enough values to conclude anything. Please run more than 30 runs on both samples.", verbose=True)
            return False, (
                "Results do not pass the Komolgorov-Smirnov test, but there are not enough values to conclude. "
                "Run more than 30 runs on both samples and try again. "
                f"Length of sample X = {len(X)}, of sample Y = {len(Y)}"
            )
        log("Both samples have at least 30 values, so the test is run anyway.", verbose=True)

    log("Calculating the unpaired, one-sided Wilcoxon-Mann-Whitney test.", verbose=True)
    log(f"First few X values: {X[:3]}", verbose=True)
    log(f"First few Y values: {Y[:3]}", verbose=True)
    log(f"X length: {len(X)}, Y length: {len(Y)}", verbose=True)

    result = mannwhitneyu(X, Y, alternative='greater', method='exact')

    if result.pvalue < ALPHA:
        log(f"result is statistically significant with p-value = {result.pvalue} and alpha {ALPHA}", verbose=True)
        log(f"The confidence level is therefore 1 - alpha = {1 - ALPHA}", verbose=True)
        return True, ""

    log(f"result is not statistically significant, with p-value = {result.pvalue} and alpha {ALPHA}", verbose=True)
    return False, f"p-value {result.pvalue} is not smaller than the alpha {ALPHA}"
def find_codec_json_file(results_path: Path) -> Path:
    """Finds the JSON log file in the results path.

    Only files directly inside ``results_path`` are considered. When several
    JSON files are present, the one that sorts first is returned so that the
    choice does not depend on directory listing order.

    :param results_path: Path to the results directory
    :type results_path: Path
    :return: Path to the JSON file that sorts first among the matches
    :rtype: Path
    :raises FileNotFoundError: If no JSON file is found in the path
    """
    # glob() yields entries in arbitrary order; sort to make the pick stable.
    json_files = sorted(results_path.glob("*.json"))
    if not json_files:
        raise FileNotFoundError(f"No JSON log file found in {results_path}")
    return json_files[0]
def main() -> None:
    """Run the significance analysis for every configured performance block.

    For each entry of the ``performance`` configuration, the JSON log of the
    baseline codec and of every codec listed under ``analyse`` is located.
    Then, for the encoder and the decoder separately, each codec's execution
    times are compared with the baseline's for every light field and BPP, and
    the verdict is printed.

    The process exits with status 1 when the performance configuration is
    empty, when a block has no baseline or no codecs to analyse, or when an
    error occurs while comparing samples. A codec (or a block's baseline)
    whose configuration, results path or log file is missing is reported on
    stderr and skipped.

    :return: None
    :rtype: None
    """
    global VERBOSITY

    # Read configuration from command line arguments FIRST (before argparse)
    # This is the same pattern used in speedup.py and script.py
    base_path = Path(os.path.abspath(os.path.dirname(sys.argv[0])))
    configuration = read_config_from_argv(overriden_base_path=base_path / "..")

    # Now parse remaining arguments (only verbose flag)
    parser = argparse.ArgumentParser(
        description="Calculates the statistical significance of speedup results based on the Wilcoxon-Mann-Whitney test.",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('-v', '--verbose', action='store_true', help="Increase output verbosity.")
    args, _remaining = parser.parse_known_args()

    if args.verbose:
        VERBOSITY = 1

    # Get performance configurations
    performance_configurations = configuration["performance"]
    if not performance_configurations:
        print("Error: No 'performance' configuration found in configuration file.", file=sys.stderr)
        sys.exit(1)

    for perf_config in performance_configurations:
        print("\nProcessing statistical significance tests for performance configuration block...")

        # Get codecs to process and baseline
        codecs_to_process = perf_config["codecs"].get("analyse", [])
        baseline_codec = perf_config["codecs"].get("baseline", None)

        if not baseline_codec:
            print("Warning: No baseline_codec defined in performance configuration.", file=sys.stderr)
            sys.exit(1)

        if not codecs_to_process:
            print("Warning: No codecs_to_process defined in performance configuration. Skipping.", file=sys.stderr)
            sys.exit(1)

        if baseline_codec not in codecs_to_process:
            print(f"Warning: baseline_codec '{baseline_codec}' not in codecs_to_process. Adding it.", file=sys.stderr)
            codecs_to_process = [baseline_codec] + [c for c in codecs_to_process if c != baseline_codec]

        # Find baseline JSON file
        baseline_codec_config = configuration["codecs"]["configuration"].get(baseline_codec, {})
        if not baseline_codec_config:
            print(f"Error: No configuration found for baseline codec '{baseline_codec}'", file=sys.stderr)
            continue

        baseline_results = baseline_codec_config.get("results")
        if not baseline_results:
            # expandvars() cannot take None, so report the gap instead of crashing.
            print(f"Error: No 'results' path configured for baseline codec '{baseline_codec}'", file=sys.stderr)
            continue

        baseline_results_path = Path(os.path.expandvars(baseline_results))
        try:
            baseline_json_file = find_codec_json_file(baseline_results_path)
            log(f"Found baseline log file: {baseline_json_file}")
        except FileNotFoundError as e:
            print(f"Error: {e}", file=sys.stderr)
            continue

        # Find JSON files for all other codecs
        codec_json_files = {}
        for codec_name in codecs_to_process:
            if codec_name == baseline_codec:
                codec_json_files[codec_name] = baseline_json_file
                continue

            codec_config = configuration["codecs"]["configuration"].get(codec_name, {})
            if not codec_config:
                print(f"Warning: No configuration found for codec '{codec_name}'. Skipping.", file=sys.stderr)
                continue

            codec_results = codec_config.get("results")
            if not codec_results:
                print(f"Warning: No 'results' path configured for codec '{codec_name}'. Skipping.", file=sys.stderr)
                continue

            results_path = Path(os.path.expandvars(codec_results))
            try:
                json_file = find_codec_json_file(results_path)
                codec_json_files[codec_name] = json_file
                log(f"Found log file for {codec_name}: {json_file}", verbose=True)
            except FileNotFoundError as e:
                print(f"Warning: {e}. Skipping codec '{codec_name}'.", file=sys.stderr)
                continue

        # Run statistical significance tests
        for stype in (SoftwareType.ENCODER, SoftwareType.DECODER):
            print(f"\033[94m! Running statistical significance tests for the {stype.value}.\033[0m")

            try:
                # Extract baseline data
                baseline_dict = extract_log_execution_times(baseline_json_file, stype)

                # Compare each codec against baseline
                for codec_name, codec_json_file in codec_json_files.items():
                    if codec_name == baseline_codec:
                        continue  # Skip baseline

                    print(f"\033[94m!! Comparing {codec_name} against baseline {baseline_codec}.\033[0m")
                    codec_dict = extract_log_execution_times(codec_json_file, stype)

                    # Compare each lightfield and BPP; everything present in
                    # the baseline must also be present for the codec.
                    for lf, lf_obj in baseline_dict.items():
                        if not codec_dict.get(lf):
                            raise ArgumentMismatchError(lf, str(codec_json_file))

                        for bpp, X in lf_obj.items():
                            if not codec_dict[lf].get(bpp):
                                raise ArgumentMismatchError(f"{bpp}", str(codec_json_file))

                            Y = codec_dict[lf][bpp]

                            log(f"X samples from baseline: {X}", verbose=True)
                            log(f"Y samples from results: {Y}", verbose=True)

                            significance, cause = get_statistical_significances(X, Y)

                            prefix = f"[{codec_name} / LF {lf} / BPP {bpp}]"
                            padded_prefix = f"{prefix:<60}"

                            if significance:
                                print(f"\033[32m{padded_prefix}\033[0m \033[94mStatistically significant.\033[0m")
                            else:
                                padding = ' ' * (len(padded_prefix) // 3)
                                print(f"\033[31m{padded_prefix}\033[0m \033[94mNOT statistically significant.\033[0m")
                                print(f"{padding} -> {codec_name} compared to baseline {baseline_codec}")
                                print(f"{padding} -> Reason: {cause}")
                                print("")

            except ArgumentMismatchError as e:
                print(f"\033[31mError: {e}\033[0m", file=sys.stderr)
                sys.exit(1)
            except FileNotFoundError as e:
                print(f"\033[31mError: File not found: {e}\033[0m", file=sys.stderr)
                sys.exit(1)
            except Exception as e:
                # Top-level boundary: report any other failure and stop.
                print(f"\033[31mError: {type(e).__name__}: {e}\033[0m", file=sys.stderr)
                sys.exit(1)
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__": main()