Commit
·
4ee078c
1
Parent(s):
cf62ef7
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
| 1 |
"""
|
| 2 |
Baseball statistics application with txtai and Streamlit.
|
| 3 |
|
| 4 |
-
Install txtai and streamlit to run:
|
| 5 |
pip install txtai streamlit
|
| 6 |
"""
|
| 7 |
|
| 8 |
import datetime
|
| 9 |
import os
|
| 10 |
|
|
|
|
| 11 |
import numpy as np
|
| 12 |
import pandas as pd
|
| 13 |
import streamlit as st
|
|
@@ -57,15 +58,12 @@ class Stats:
|
|
| 57 |
|
| 58 |
raise NotImplementedError
|
| 59 |
|
| 60 |
-
def
|
| 61 |
"""
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
Args:
|
| 65 |
-
rows: input DataFrame
|
| 66 |
|
| 67 |
Returns:
|
| 68 |
-
|
| 69 |
"""
|
| 70 |
|
| 71 |
raise NotImplementedError
|
|
@@ -116,30 +114,41 @@ class Stats:
|
|
| 116 |
vectors = {f'{row["yearID"]}{row["playerID"]}': self.transform(row) for _, row in self.stats.iterrows()}
|
| 117 |
data = {f'{row["yearID"]}{row["playerID"]}': dict(row) for _, row in self.stats.iterrows()}
|
| 118 |
|
| 119 |
-
embeddings = Embeddings(
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
| 122 |
|
| 123 |
embeddings.index((uid, vectors[uid], None) for uid in vectors)
|
| 124 |
|
| 125 |
return vectors, data, embeddings
|
| 126 |
|
| 127 |
-
def
|
| 128 |
"""
|
| 129 |
-
Looks up
|
| 130 |
|
| 131 |
Args:
|
| 132 |
player: player name
|
| 133 |
|
| 134 |
Returns:
|
| 135 |
-
|
| 136 |
"""
|
| 137 |
|
| 138 |
if player in self.names:
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
def search(self, player=None, year=None, row=None, limit=10):
|
| 145 |
"""
|
|
@@ -196,10 +205,42 @@ class Stats:
|
|
| 196 |
|
| 197 |
|
| 198 |
class Batting(Stats):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
def loadcolumns(self):
|
| 200 |
return [
|
| 201 |
-
"birthMonth",
|
| 202 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
]
|
| 204 |
|
| 205 |
def load(self):
|
|
@@ -230,8 +271,8 @@ class Batting(Stats):
|
|
| 230 |
|
| 231 |
return batting
|
| 232 |
|
| 233 |
-
def
|
| 234 |
-
return
|
| 235 |
|
| 236 |
def vector(self, row):
|
| 237 |
row["TB"] = row["1B"] + 2 * row["2B"] + 3 * row["3B"] + 4 * row["HR"]
|
|
@@ -255,7 +296,7 @@ class Batting(Stats):
|
|
| 255 |
"""
|
| 256 |
|
| 257 |
positions = {}
|
| 258 |
-
for
|
| 259 |
uid = f'{row["yearID"]}{row["playerID"]}'
|
| 260 |
position = row["POS"] if row["POS"] else 0
|
| 261 |
if position == "P":
|
|
@@ -294,12 +335,46 @@ class Batting(Stats):
|
|
| 294 |
uid = f'{row["yearID"]}{row["playerID"]}'
|
| 295 |
return positions[uid][0] if uid in positions else 0
|
| 296 |
|
|
|
|
| 297 |
class Pitching(Stats):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
def loadcolumns(self):
|
| 299 |
return [
|
| 300 |
-
"birthMonth",
|
| 301 |
-
"
|
| 302 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
]
|
| 304 |
|
| 305 |
def load(self):
|
|
@@ -316,16 +391,16 @@ class Pitching(Stats):
|
|
| 316 |
# Calculated columns
|
| 317 |
pitching["age"] = pitching["yearID"] - pitching["birthYear"]
|
| 318 |
pitching["WHIP"] = (pitching["BB"] + pitching["H"]) / (pitching["IPouts"] / 3)
|
| 319 |
-
pitching["WADJ"] =(pitching["W"] + pitching["SV"]) / (pitching["ERA"] + pitching["WHIP"])
|
| 320 |
|
| 321 |
return pitching
|
| 322 |
|
| 323 |
-
def
|
| 324 |
-
return
|
| 325 |
|
| 326 |
def vector(self, row):
|
| 327 |
row["WHIP"] = (row["BB"] + row["H"]) / (row["IPouts"] / 3) if row["IPouts"] else None
|
| 328 |
-
row["WADJ"] =(row["W"] + row["SV"]) / (row["ERA"] + row["WHIP"]) if row["ERA"] and row["WHIP"] else None
|
| 329 |
|
| 330 |
return self.transform(row)
|
| 331 |
|
|
@@ -352,13 +427,15 @@ class Application:
|
|
| 352 |
"""
|
| 353 |
|
| 354 |
st.title("⚾ Baseball Statistics")
|
| 355 |
-
st.markdown(
|
|
|
|
| 356 |
This application finds the best matching historical players using vector search with [txtai](https://github.com/neuml/txtai).
|
| 357 |
Raw data is from the [Baseball Databank](https://github.com/chadwickbureau/baseballdatabank) GitHub project.
|
| 358 |
-
"""
|
|
|
|
| 359 |
|
| 360 |
self.player()
|
| 361 |
-
|
| 362 |
def player(self):
|
| 363 |
"""
|
| 364 |
Player tab.
|
|
@@ -373,19 +450,59 @@ class Application:
|
|
| 373 |
names = sorted(stats.names)
|
| 374 |
player = st.selectbox("Player", names, names.index(default))
|
| 375 |
|
|
|
|
|
|
|
|
|
|
| 376 |
# Player year
|
| 377 |
-
|
| 378 |
-
|
|
|
|
|
|
|
|
|
|
| 379 |
|
| 380 |
# Run search
|
| 381 |
results = stats.search(player, year)
|
| 382 |
|
| 383 |
# Display results
|
| 384 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
|
| 386 |
-
def
|
| 387 |
"""
|
| 388 |
-
Displays a list of results.
|
| 389 |
|
| 390 |
Args:
|
| 391 |
results: list of results
|
|
|
|
| 1 |
"""
|
| 2 |
Baseball statistics application with txtai and Streamlit.
|
| 3 |
|
| 4 |
+
Install txtai and streamlit (>= 1.23) to run:
|
| 5 |
pip install txtai streamlit
|
| 6 |
"""
|
| 7 |
|
| 8 |
import datetime
|
| 9 |
import os
|
| 10 |
|
| 11 |
+
import altair as alt
|
| 12 |
import numpy as np
|
| 13 |
import pandas as pd
|
| 14 |
import streamlit as st
|
|
|
|
| 58 |
|
| 59 |
raise NotImplementedError
|
| 60 |
|
| 61 |
+
def metric(self):
|
| 62 |
"""
|
| 63 |
+
Primary metric column.
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
Returns:
|
| 66 |
+
metric column name
|
| 67 |
"""
|
| 68 |
|
| 69 |
raise NotImplementedError
|
|
|
|
| 114 |
vectors = {f'{row["yearID"]}{row["playerID"]}': self.transform(row) for _, row in self.stats.iterrows()}
|
| 115 |
data = {f'{row["yearID"]}{row["playerID"]}': dict(row) for _, row in self.stats.iterrows()}
|
| 116 |
|
| 117 |
+
embeddings = Embeddings(
|
| 118 |
+
{
|
| 119 |
+
"transform": self.transform,
|
| 120 |
+
}
|
| 121 |
+
)
|
| 122 |
|
| 123 |
embeddings.index((uid, vectors[uid], None) for uid in vectors)
|
| 124 |
|
| 125 |
return vectors, data, embeddings
|
| 126 |
|
| 127 |
+
def metrics(self, player):
|
| 128 |
"""
|
| 129 |
+
Looks up a player's active years, best statistical year and key metrics.
|
| 130 |
|
| 131 |
Args:
|
| 132 |
player: player name
|
| 133 |
|
| 134 |
Returns:
|
| 135 |
+
active, best, metrics
|
| 136 |
"""
|
| 137 |
|
| 138 |
if player in self.names:
|
| 139 |
+
# Get player stats
|
| 140 |
+
stats = self.stats[self.stats["playerID"] == self.names[player]]
|
| 141 |
+
|
| 142 |
+
# Build key metrics
|
| 143 |
+
metrics = stats[["yearID", self.metric()]]
|
| 144 |
+
|
| 145 |
+
# Get best year, sort by primary metric
|
| 146 |
+
best = int(stats.sort_values(by=self.metric(), ascending=False)["yearID"].iloc[0])
|
| 147 |
|
| 148 |
+
# Get years active, best year, along with metric trends
|
| 149 |
+
return metrics["yearID"].tolist(), best, metrics
|
| 150 |
+
|
| 151 |
+
return range(1871, datetime.datetime.today().year), 1950, None
|
| 152 |
|
| 153 |
def search(self, player=None, year=None, row=None, limit=10):
|
| 154 |
"""
|
|
|
|
| 205 |
|
| 206 |
|
| 207 |
class Batting(Stats):
|
| 208 |
+
"""
|
| 209 |
+
Batting stats.
|
| 210 |
+
"""
|
| 211 |
+
|
| 212 |
def loadcolumns(self):
|
| 213 |
return [
|
| 214 |
+
"birthMonth",
|
| 215 |
+
"yearID",
|
| 216 |
+
"age",
|
| 217 |
+
"height",
|
| 218 |
+
"weight",
|
| 219 |
+
"G",
|
| 220 |
+
"AB",
|
| 221 |
+
"R",
|
| 222 |
+
"H",
|
| 223 |
+
"1B",
|
| 224 |
+
"2B",
|
| 225 |
+
"3B",
|
| 226 |
+
"HR",
|
| 227 |
+
"RBI",
|
| 228 |
+
"SB",
|
| 229 |
+
"CS",
|
| 230 |
+
"BB",
|
| 231 |
+
"SO",
|
| 232 |
+
"IBB",
|
| 233 |
+
"HBP",
|
| 234 |
+
"SH",
|
| 235 |
+
"SF",
|
| 236 |
+
"GIDP",
|
| 237 |
+
"POS",
|
| 238 |
+
"AVG",
|
| 239 |
+
"OBP",
|
| 240 |
+
"TB",
|
| 241 |
+
"SLG",
|
| 242 |
+
"OPS",
|
| 243 |
+
"OPS+",
|
| 244 |
]
|
| 245 |
|
| 246 |
def load(self):
|
|
|
|
| 271 |
|
| 272 |
return batting
|
| 273 |
|
| 274 |
+
def metric(self):
|
| 275 |
+
return "OPS+"
|
| 276 |
|
| 277 |
def vector(self, row):
|
| 278 |
row["TB"] = row["1B"] + 2 * row["2B"] + 3 * row["3B"] + 4 * row["HR"]
|
|
|
|
| 296 |
"""
|
| 297 |
|
| 298 |
positions = {}
|
| 299 |
+
for _, row in fielding.iterrows():
|
| 300 |
uid = f'{row["yearID"]}{row["playerID"]}'
|
| 301 |
position = row["POS"] if row["POS"] else 0
|
| 302 |
if position == "P":
|
|
|
|
| 335 |
uid = f'{row["yearID"]}{row["playerID"]}'
|
| 336 |
return positions[uid][0] if uid in positions else 0
|
| 337 |
|
| 338 |
+
|
| 339 |
class Pitching(Stats):
|
| 340 |
+
"""
|
| 341 |
+
Pitching stats.
|
| 342 |
+
"""
|
| 343 |
+
|
| 344 |
def loadcolumns(self):
|
| 345 |
return [
|
| 346 |
+
"birthMonth",
|
| 347 |
+
"yearID",
|
| 348 |
+
"age",
|
| 349 |
+
"height",
|
| 350 |
+
"weight",
|
| 351 |
+
"W",
|
| 352 |
+
"L",
|
| 353 |
+
"G",
|
| 354 |
+
"GS",
|
| 355 |
+
"CG",
|
| 356 |
+
"SHO",
|
| 357 |
+
"SV",
|
| 358 |
+
"IPouts",
|
| 359 |
+
"H",
|
| 360 |
+
"ER",
|
| 361 |
+
"HR",
|
| 362 |
+
"BB",
|
| 363 |
+
"SO",
|
| 364 |
+
"BAOpp",
|
| 365 |
+
"ERA",
|
| 366 |
+
"IBB",
|
| 367 |
+
"WP",
|
| 368 |
+
"HBP",
|
| 369 |
+
"BK",
|
| 370 |
+
"BFP",
|
| 371 |
+
"GF",
|
| 372 |
+
"R",
|
| 373 |
+
"SH",
|
| 374 |
+
"SF",
|
| 375 |
+
"GIDP",
|
| 376 |
+
"WHIP",
|
| 377 |
+
"WADJ",
|
| 378 |
]
|
| 379 |
|
| 380 |
def load(self):
|
|
|
|
| 391 |
# Calculated columns
|
| 392 |
pitching["age"] = pitching["yearID"] - pitching["birthYear"]
|
| 393 |
pitching["WHIP"] = (pitching["BB"] + pitching["H"]) / (pitching["IPouts"] / 3)
|
| 394 |
+
pitching["WADJ"] = (pitching["W"] + pitching["SV"]) / (pitching["ERA"] + pitching["WHIP"])
|
| 395 |
|
| 396 |
return pitching
|
| 397 |
|
| 398 |
+
def metric(self):
|
| 399 |
+
return "WADJ"
|
| 400 |
|
| 401 |
def vector(self, row):
|
| 402 |
row["WHIP"] = (row["BB"] + row["H"]) / (row["IPouts"] / 3) if row["IPouts"] else None
|
| 403 |
+
row["WADJ"] = (row["W"] + row["SV"]) / (row["ERA"] + row["WHIP"]) if row["ERA"] and row["WHIP"] else None
|
| 404 |
|
| 405 |
return self.transform(row)
|
| 406 |
|
|
|
|
| 427 |
"""
|
| 428 |
|
| 429 |
st.title("⚾ Baseball Statistics")
|
| 430 |
+
st.markdown(
|
| 431 |
+
"""
|
| 432 |
This application finds the best matching historical players using vector search with [txtai](https://github.com/neuml/txtai).
|
| 433 |
Raw data is from the [Baseball Databank](https://github.com/chadwickbureau/baseballdatabank) GitHub project.
|
| 434 |
+
"""
|
| 435 |
+
)
|
| 436 |
|
| 437 |
self.player()
|
| 438 |
+
|
| 439 |
def player(self):
|
| 440 |
"""
|
| 441 |
Player tab.
|
|
|
|
| 450 |
names = sorted(stats.names)
|
| 451 |
player = st.selectbox("Player", names, names.index(default))
|
| 452 |
|
| 453 |
+
# Player metrics
|
| 454 |
+
active, best, metrics = stats.metrics(player)
|
| 455 |
+
|
| 456 |
# Player year
|
| 457 |
+
year = int(st.select_slider("Year", active, best) if len(active) > 1 else active[0])
|
| 458 |
+
|
| 459 |
+
# Display metrics chart
|
| 460 |
+
if len(active) > 1:
|
| 461 |
+
self.chart(category, metrics)
|
| 462 |
|
| 463 |
# Run search
|
| 464 |
results = stats.search(player, year)
|
| 465 |
|
| 466 |
# Display results
|
| 467 |
+
self.table(results, ["nameFirst", "nameLast", "teamID"] + stats.columns[1:] + ["link"])
|
| 468 |
+
|
| 469 |
+
def chart(self, category, metrics):
|
| 470 |
+
"""
|
| 471 |
+
Displays a metric chart.
|
| 472 |
+
|
| 473 |
+
Args:
|
| 474 |
+
category: Batting or Pitching
|
| 475 |
+
metrics: player metrics to plot
|
| 476 |
+
"""
|
| 477 |
+
|
| 478 |
+
# Key metric
|
| 479 |
+
metric = self.batting.metric() if category == "Batting" else self.pitching.metric()
|
| 480 |
+
|
| 481 |
+
# Cast year to string
|
| 482 |
+
metrics["yearID"] = metrics["yearID"].astype(str)
|
| 483 |
+
|
| 484 |
+
# Metric over years
|
| 485 |
+
chart = (
|
| 486 |
+
alt.Chart(metrics)
|
| 487 |
+
.mark_line(interpolate="monotone", point=True, strokeWidth=2.5, opacity=0.75)
|
| 488 |
+
.encode(
|
| 489 |
+
x=alt.X("yearID", title="").scale(padding=0),
|
| 490 |
+
y=alt.Y(metric).scale(zero=False, padding=0),
|
| 491 |
+
)
|
| 492 |
+
)
|
| 493 |
+
|
| 494 |
+
# Create metric median rule line
|
| 495 |
+
rule = alt.Chart(metrics).mark_rule(color="gray", strokeDash=[3, 5], opacity=0.5).encode(y=f"median({metric})")
|
| 496 |
+
|
| 497 |
+
# Layered chart configuration
|
| 498 |
+
chart = (chart + rule).encode(y=alt.Y(title=metric)).properties(height=200).configure_axis(grid=False)
|
| 499 |
+
|
| 500 |
+
# Draw chart
|
| 501 |
+
st.altair_chart(chart + rule, theme="streamlit", use_container_width=True)
|
| 502 |
|
| 503 |
+
def table(self, results, columns):
|
| 504 |
"""
|
| 505 |
+
Displays a list of results as a table.
|
| 506 |
|
| 507 |
Args:
|
| 508 |
results: list of results
|