# Utility script: converts benchmark CSV records into the JSON score files
# consumed by the leaderboard front-end.
import json
from datetime import datetime

import pandas as pd
def process_csv_to_json():
    """Convert ``src/record.csv`` into ``src/detail_math_score.json``.

    Output shape::

        {"time": "...", "results": {algorithm: {llm: {"META": {...},
                                                      dataset: {...}}}}}

    Placeholder cells (``'-'``) and missing cells (NaN) become 0 / 0.0 / ''.
    """
    df = pd.read_csv('src/record.csv')
    # Drop fully-empty rows and normalize raw CSV headers to the JSON keys.
    df = df.dropna(how='all')
    df = df.rename(columns={
        'dataset': 'Dataset',
        'llm': 'LLM',
        'score\n(EM)': 'Score',
        'pass rate': 'Pass rate',
        'Cost($)': 'Cost($)',
        'Eval Date': 'Eval Date',
        'framework': 'Framework',
        'X-shot': 'X-shot',
        'Nums': 'Samples',
        'All tokens': 'All tokens',
        'Total input tokens': 'Total input tokens',
        'Average input tokens': 'Average input tokens',
        'Total output tokens': 'Total output tokens',
        'Average output tokens': 'Average output tokens'
    })

    def parse_number(value):
        # Token counts may carry thousands separators ("1,319") or be
        # missing ('-'/NaN); normalize to int, defaulting to 0.
        if pd.isna(value) or value == '-':
            return 0
        return int(float(str(value).replace(',', '')))

    def parse_float(value):
        # BUGFIX: the original only tested for the '-' placeholder on
        # 'Score' and 'Pass rate', so a NaN cell leaked through float()
        # and json.dump then emitted literal NaN — invalid JSON for
        # strict parsers. Treat NaN exactly like '-'.
        if pd.isna(value) or value == '-':
            return None
        return float(value)

    result = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "results": {}
    }

    llms = df['LLM'].dropna().unique()
    # One top-level entry per algorithm, one nested entry per LLM that has
    # at least one row for that algorithm.
    for algorithm in df['Algorithm'].dropna().unique():
        if not isinstance(algorithm, str):
            continue
        result['results'][algorithm] = {}
        for llm in llms:
            llm_data = df[(df['Algorithm'] == algorithm) & (df['LLM'] == llm)]
            if llm_data.empty:
                continue
            result['results'][algorithm][llm] = {
                'META': {
                    'Algorithm': str(algorithm),
                    'LLM': str(llm),
                    'Eval Date': str(llm_data['Eval Date'].iloc[0])
                }
            }
            for dataset in df['Dataset'].dropna().unique():
                if not isinstance(dataset, str):
                    continue
                dataset_data = llm_data[llm_data['Dataset'] == dataset]
                if dataset_data.empty:
                    continue
                # Only the first matching row is used, as in the original.
                row = dataset_data.iloc[0]
                score = parse_float(row['Score'])
                pass_rate = parse_float(row['Pass rate'])
                cost = parse_float(row['Cost($)'])
                result['results'][algorithm][llm][dataset] = {
                    'Score': round(score, 2) if score is not None else 0,
                    # 'Pass rate' is a percentage in the CSV; store a 0-1 ratio.
                    'Pass rate': round(pass_rate / 100, 4) if pass_rate is not None else 0.0,
                    'Cost($)': cost if cost is not None else 0.0,
                    'Framework': str(row['Framework']) if 'Framework' in row and pd.notnull(row['Framework']) else '',
                    'X-shot': str(row['X-shot']) if pd.notnull(row['X-shot']) else '',
                    'Samples': parse_number(row['Samples']),
                    'All tokens': parse_number(row['All tokens']),
                    'Total input tokens': parse_number(row['Total input tokens']),
                    'Average input tokens': parse_number(row['Average input tokens']),
                    'Total output tokens': parse_number(row['Total output tokens']),
                    'Average output tokens': parse_number(row['Average output tokens'])
                }

    # Sanity pass: report (but do not fail on) entries missing expected fields.
    required_fields = ['Score', 'Pass rate', 'Cost($)', 'Framework', 'X-shot',
                       'Samples', 'All tokens', 'Total input tokens',
                       'Average input tokens', 'Total output tokens',
                       'Average output tokens']
    for algo_name, llm_map in result['results'].items():
        for llm_name, datasets in llm_map.items():
            meta = datasets.get('META', {})
            if 'LLM' not in meta or 'Eval Date' not in meta:
                print(f"Missing META fields in algorithm '{algo_name}' for LLM '{llm_name}'")
            for dataset_name, data in datasets.items():
                if dataset_name == 'META':
                    continue
                missing_fields = [field for field in required_fields if field not in data]
                if missing_fields:
                    print(f"Missing fields {missing_fields} in dataset '{dataset_name}' for LLM '{llm_name}' in algorithm '{algo_name}'")

    with open('src/detail_math_score.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
def process_csv_to_overall_json():
    """Build ``src/overall_math_score.json`` from ``src/record.csv``.

    One top-level entry per algorithm/LLM pair — keyed by the bare algorithm
    name for gpt-3.5-turbo and ``"<algorithm>-<llm>"`` for every other model —
    holding Score and Cost($) for the three math datasets.
    """
    table = pd.read_csv('src/record.csv').dropna(how='all')
    table = table.rename(columns={
        'dataset': 'Dataset',
        'llm': 'LLM',
        'score\n(EM)': 'Score',
        'Cost($)': 'Cost($)',
        'Eval Date': 'Eval Date'
    })

    def cell_as_float(series):
        # First matching cell as float; '-' placeholders and NaN become 0.0.
        value = series.iloc[0]
        if pd.notnull(value) and value != '-':
            return float(value)
        return 0.0

    output = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "results": {}
    }

    model_names = table['LLM'].dropna().unique()
    algorithm_names = [name for name in table['Algorithm'].dropna().unique()
                       if isinstance(name, str)]
    for model in model_names:
        for algorithm in algorithm_names:
            # gpt-3.5-turbo is the baseline and keeps the bare algorithm
            # name; every other model gets a suffixed key for uniqueness.
            key = algorithm if model == 'gpt-3.5-turbo' else f"{algorithm}-{model}"
            subset = table[(table['Algorithm'] == algorithm) & (table['LLM'] == model)]
            if subset.empty:
                print(f"No data found for algorithm '{algorithm}' and LLM '{model}'")
                continue
            entry = {
                "META": {
                    "Algorithm": algorithm,
                    "LLM": model,
                    "Eval Date": str(subset['Eval Date'].iloc[0])
                }
            }
            for bench in ['gsm8k', 'AQuA', 'MATH-500']:
                rows = subset[subset['Dataset'] == bench]
                if rows.empty:
                    # Keep the dataset key present with neutral defaults.
                    entry[bench] = {"Score": 0.0, "Cost($)": 0.0}
                else:
                    entry[bench] = {
                        "Score": cell_as_float(rows['Score']),
                        "Cost($)": cell_as_float(rows['Cost($)'])
                    }
            output['results'][key] = entry

    with open('src/overall_math_score.json', 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=4, ensure_ascii=False)
def process_multi_modal_csv():
    """Convert ``src/multi-modal.csv`` into ``src/multi_modal_results.json``.

    The CSV may contain Excel formula-quoting artifacts (``="..."``) in both
    headers and cells; these are stripped before numeric normalization.
    Cleaned rows are emitted verbatim as a list of records.
    """
    df = pd.read_csv('src/multi-modal.csv', skipinitialspace=True)
    # Strip '="' / '"' artifacts and surrounding whitespace from headers.
    # (The original also ran an identity rename over every column — a no-op
    # that has been removed.)
    df.columns = df.columns.str.strip().str.replace('="', '').str.replace('"', '')

    def clean_cell(x):
        # Strip the same artifacts from string cells; pass others through.
        return str(x).replace('="', '').replace('"', '').strip() if isinstance(x, str) else x

    # DataFrame.applymap is deprecated since pandas 2.1; per-column
    # Series.map is behavior-identical and works on old and new pandas.
    for column in df.columns:
        df[column] = df[column].map(clean_cell)

    def parse_number(value):
        # "1,234"-style counts -> int; '-'/NaN -> 0.
        if pd.isna(value) or value == '-':
            return 0
        return int(float(str(value).replace(',', '')))

    df['Score'] = df['Score'].apply(lambda x: round(float(x), 2) if pd.notnull(x) and x != '-' else 0.0)
    # 'Pass Rate' is a percentage in the CSV; store a 0-1 ratio.
    df['Pass Rate'] = df['Pass Rate'].apply(lambda x: round(float(x) / 100, 4) if pd.notnull(x) and x != '-' else 0.0)
    df['Total Input Tokens'] = df['Total Input Tokens'].apply(parse_number)
    df['Total Output Tokens'] = df['Total Output Tokens'].apply(parse_number)
    df['All Tokens'] = df['All Tokens'].apply(parse_number)

    result = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "multi_modal_results": df.to_dict(orient='records')
    }
    with open('src/multi_modal_results.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
def _main():
    # Regenerate all three JSON artifacts from the CSV sources.
    process_csv_to_json()
    process_csv_to_overall_json()
    process_multi_modal_csv()


if __name__ == "__main__":
    _main()