calculate_elo_ratings.py

  1. """
  2. Elo ratings for olmOCR vs baselines.
  3. See data at scripts/elo/ratings.csv
  4. MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%)
  5. marker,mineru,53,26,67.1,32.9
  6. mineru,pdelf,22,55,28.6,71.4
  7. gotocr_format,marker,26,45,36.6,63.4
  8. marker,pdelf,31,49,38.8,61.3
  9. gotocr_format,pdelf,29,41,41.4,58.6
  10. gotocr_format,mineru,38,37,50.7,49.3
  11. Invoke via
  12. python calculate_elo_ratings.py ratings.csv --num-bootstrap 5000 --num-elo-sims 100 --confidence-level 95 --seed 123
  13. Output:
  14. Bootstrapped Elo Ratings (95% CI):
  15. --------------------------------------------------
  16. pdelf 1813.0 ± 84.9 [1605.9, 1930.0]
  17. mineru 1545.2 ± 99.7 [1336.7, 1714.1]
  18. marker 1429.1 ± 100.7 [1267.6, 1645.5]
  19. gotocr_format 1212.7 ± 82.0 [1097.3, 1408.3]
  20. Pairwise Significance Tests:
  21. --------------------------------------------------
  22. gotocr_format vs marker Δ = -216.3 [-470.8, 135.0] p = 0.218
  23. gotocr_format vs mineru Δ = -332.5 [-567.5, 19.3] p = 0.051
  24. gotocr_format vs pdelf Δ = -600.3 [-826.1, -344.3] p = 0.000*
  25. marker vs mineru Δ = -116.1 [-365.4, 246.5] p = 0.430
  26. marker vs pdelf Δ = -383.9 [-610.6, -10.9] p = 0.044*
  27. mineru vs pdelf Δ = -267.8 [-517.3, 104.0] p = 0.135
  28. @kylel
  29. """

import random
from itertools import combinations

import click
import numpy as np
import pandas as pd
from tqdm import tqdm

def calculate_elo(matches_data, all_methods, k_factor=32, initial_rating=1500, n_replications=10, random_state=None):
    """Calculate Elo ratings, averaged over multiple replications with shuffled match order."""
    all_ratings = {method: [] for method in all_methods}
    for _ in range(n_replications):
        # Elo is order-dependent, so each replication replays the same
        # matches in a freshly shuffled order.
        matches = matches_data.sample(frac=1, replace=False, random_state=random_state).reset_index(drop=True)
        ratings = {method: initial_rating for method in all_methods}
        for _, row in matches.iterrows():
            method_a, method_b = row["MethodA"], row["MethodB"]
            a_wins, b_wins = row["A_wins"], row["B_wins"]
            # Expand each match-up into individual games: all A wins, then all B wins.
            for _ in range(int(a_wins)):
                ra, rb = update_single_match(ratings[method_a], ratings[method_b], 1, k_factor)
                ratings[method_a], ratings[method_b] = ra, rb
            for _ in range(int(b_wins)):
                ra, rb = update_single_match(ratings[method_a], ratings[method_b], 0, k_factor)
                ratings[method_a], ratings[method_b] = ra, rb
        for method in all_methods:
            all_ratings[method].append(ratings[method])
    return {method: np.mean(ratings) for method, ratings in all_ratings.items()}
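
# A minimal usage sketch (hypothetical toy data, not the shipped CSV): with a
# single match-up where "a" beats "b" 7 games to 3, "a" ends above the 1500
# starting rating and "b" symmetrically below it, since each pairwise update
# conserves the total rating.
#
#   toy = pd.DataFrame({"MethodA": ["a"], "MethodB": ["b"], "A_wins": [7], "B_wins": [3]})
#   calculate_elo(toy, {"a", "b"}, n_replications=1)
#   # -> roughly {'a': 1522, 'b': 1478}; hand-computed, so treat as approximate,
#   #    but the sum always stays at 3000.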

def update_single_match(rating_a, rating_b, actual_score, k_factor):
    """Update ratings for a single match (actual_score: 1 if A wins, 0 if B wins)."""
    # Standard logistic expected score on the usual 400-point Elo scale.
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    new_rating_a = rating_a + k_factor * (actual_score - expected_a)
    new_rating_b = rating_b + k_factor * ((1 - actual_score) - (1 - expected_a))
    return new_rating_a, new_rating_b
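
# A quick sanity check of the update rule (illustrative, not part of the
# script): two evenly matched methods at the initial 1500 rating have an
# expected score of 0.5, so with k_factor=32 the winner gains 16 points and
# the loser drops by the same amount.
#
#   update_single_match(1500, 1500, 1, 32)  # -> (1516.0, 1484.0)
#
# A 400-point favorite has an expected score of ~0.909, so an upset win by
# the underdog moves roughly 29 points (32 * 0.909) in each direction.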

def bootstrap_elo_and_tests(df, num_bootstrap=1000, num_elo_sims=10, confidence_level=95, k_factor=32, initial_rating=1500, random_state=None):
    """Calculate bootstrapped Elo ratings with confidence intervals and perform pairwise significance tests."""
    ci_lower = (100 - confidence_level) / 2
    ci_upper = 100 - ci_lower
    all_methods = set(df["MethodA"].unique()) | set(df["MethodB"].unique())
    bootstrap_ratings = {method: [] for method in all_methods}
    for _ in tqdm(range(num_bootstrap)):
        # Resample the match-ups with replacement, then re-run the Elo simulation.
        bootstrap_sample = df.sample(n=len(df), replace=True, random_state=random_state)
        ratings = calculate_elo(bootstrap_sample, all_methods, k_factor, initial_rating, num_elo_sims)
        for method in all_methods:
            bootstrap_ratings[method].append(ratings[method])

    # Basic statistics per method
    results = {}
    for method in all_methods:
        ratings_array = np.array(bootstrap_ratings[method])
        results[method] = {
            "mean": np.mean(ratings_array),
            "std": np.std(ratings_array),
            "ci_lower": np.percentile(ratings_array, ci_lower),
            "ci_upper": np.percentile(ratings_array, ci_upper),
            "bootstrap_samples": ratings_array,  # Store for significance testing
        }

    # Pairwise significance tests on the paired bootstrap samples
    significance_tests = {}
    for method1, method2 in combinations(all_methods, 2):
        # Distribution of rating differences across bootstrap replicates
        diff_distribution = results[method1]["bootstrap_samples"] - results[method2]["bootstrap_samples"]
        # Two-tailed bootstrap p-value: twice the smaller tail probability
        p_value = 2 * min(np.mean(diff_distribution >= 0), np.mean(diff_distribution <= 0))
        significance_tests[(method1, method2)] = {
            "diff_mean": np.mean(diff_distribution),
            "diff_ci_lower": np.percentile(diff_distribution, ci_lower),
            "diff_ci_upper": np.percentile(diff_distribution, ci_upper),
            "p_value": p_value,
        }
    return results, significance_tests
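
# Illustration of the two-tailed p-value arithmetic above (made-up numbers):
# if method1 outrates method2 in 99% of bootstrap replicates, the tail
# probabilities are 0.99 and 0.01, so p = 2 * min(0.99, 0.01) = 0.02, which
# is significant at the 95% level. A 50/50 split gives p = 2 * 0.5 = 1.0,
# i.e. no evidence of a difference.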

@click.command()
@click.argument("ratings_file", type=click.Path(exists=True))
@click.option("--num-bootstrap", default=1000, help="Number of bootstrap iterations")
@click.option("--num-elo-sims", default=10, help="Number of Elo simulations per bootstrap")
@click.option("--confidence-level", default=95, help="Confidence level for intervals (in percent)")
@click.option("--seed", default=42, help="Random seed for reproducibility")
def main(ratings_file, num_bootstrap, num_elo_sims, confidence_level, seed):
    # Seed both RNGs so pandas' sampling (backed by NumPy) is reproducible
    random.seed(seed)
    np.random.seed(seed)

    # Load pairwise win/loss data
    df = pd.read_csv(ratings_file)

    # Calculate bootstrapped Elo ratings; pass confidence_level through so the
    # reported CIs actually match the --confidence-level flag
    results, significance_tests = bootstrap_elo_and_tests(
        df, num_bootstrap=num_bootstrap, num_elo_sims=num_elo_sims, confidence_level=confidence_level
    )

    # Sort and display ratings, best first
    print(f"\nBootstrapped Elo Ratings ({confidence_level}% CI):")
    print("-" * 50)
    sorted_results = dict(sorted(results.items(), key=lambda x: x[1]["mean"], reverse=True))
    for method, stats in sorted_results.items():
        print(f"{method:12} {stats['mean']:6.1f} ± {stats['std']:4.1f} [{stats['ci_lower']:6.1f}, {stats['ci_upper']:6.1f}]")

    # Display pairwise significance tests; '*' marks differences significant
    # at the chosen confidence level (e.g. p < 0.05 for 95%)
    print("\nPairwise Significance Tests:")
    print("-" * 50)
    for (method1, method2), stats in significance_tests.items():
        sig_marker = "*" if stats["p_value"] < (1 - confidence_level / 100) else " "
        print(
            f"{method1:12} vs {method2:12} Δ = {stats['diff_mean']:6.1f} "
            + f"[{stats['diff_ci_lower']:6.1f}, {stats['diff_ci_upper']:6.1f}] "
            + f"p = {stats['p_value']:.3f}{sig_marker}"
        )


if __name__ == "__main__":
    main()