With access to decades worth of data including player statistics like school attended, salaries, teams played for, height and weight, and more. using advanced SQL querying techniques to track how player statistics have changed over time and across different teams in the league.

  • In each decade, how many schools were there that produced MLB players?
  • What are the names of the top 5 schools that produced the most players?
  • For each decade, what were the names of the top 3 schools that produced the most players?
SELECT * FROM [dbo].[schools];
SELECT * FROM [dbo].[school_details];

-- how many player in each decade 

SELECT ROUND (yearID , -1) AS Decade , COUNT(DISTINCT schoolID ) AS NUM_SCHOOLS
FROM schools 
GROUP BY  ROUND (yearID , -1)
ORDER BY Decade;

-- top 5 schools that produced most player 

SELECT TOP 5 SD.name_full, COUNT(DISTINCT S.playerID ) AS NUM_PLAYERS 
FROM schools S LEFT JOIN school_details SD 
ON S.schoolID=SD.schoolID
GROUP BY SD.name_full
ORDER BY NUM_PLAYERS DESC ; 

-- for each decade name of the top 3 schools that produced most players 

WITH SD AS (SELECT ROUND (S.yearID , -1)AS Decade ,  SD.name_full, COUNT(DISTINCT S.playerID ) AS NUM_PLAYERS 
FROM schools S LEFT JOIN school_details SD 
ON S.schoolID=SD.schoolID
GROUP BY ROUND (S.yearID , -1) , SD.name_full
),

RN AS (SELECT Decade,name_full,NUM_PLAYERS,
   ROW_NUMBER() OVER(PARTITION BY Decade ORDER BY NUM_PLAYERS DESC) AS ROW_NUM
FROM SD)

SELECT Decade,name_full,NUM_PLAYERS 
FROM RN 
WHERE ROW_NUM <= 3 
ORDER BY Decade DESC , ROW_NUM; 
  • Return the top 20% of teams in terms of average annual spending
  • For each team, show the cumulative sum of spending over the years
  • Return the first year that each team’s cumulative spending surpassed 1 billion
SELECT * FROM [dbo].[salaries];
-- return the top 20% of teams in term of average annual spending 

WITH TS AS (SELECT teamID , yearID , SUM(CAST (salary AS bigint)) AS TOTAL_SPEND 
			FROM [dbo].[salaries]
			GROUP BY teamID , yearID
			),
SP AS		(SELECT teamID , AVG(TOTAL_SPEND) AS AVG_SPEND , 
			NTILE(5) OVER (ORDER BY AVG(TOTAL_SPEND) DESC ) AS SPEND_PCT 
			FROM TS 
			GROUP BY teamID)

SELECT teamID , ROUND(AVG_SPEND / 1000000 , 1 ) AS AVG_SPEND_MILLIONS  FROM SP 
WHERE SPEND_PCT=1; 

-- for each team calculate the cumulative sum of spendinig over the years 
WITH TS AS (SELECT teamID, yearID, SUM(CAST (salary AS BIGINT)) AS TOTAL_SPEND 
		FROM salaries
		GROUP BY teamID , yearID)

SELECT teamID , yearID , TOTAL_SPEND/1000000  AS TOTAL_SPEND, 
		ROUND(SUM(TOTAL_SPEND) OVER( PARTITION BY teamID ORDER BY yearID) / 1000000,1 ) AS CUMULATIVE_SUM_MILIONS
FROM TS ;

-- return the first year that each team cumulative spending surpassed 1 bilion 
WITH TS AS (SELECT teamID, yearID, SUM(CAST (salary AS BIGINT)) AS TOTAL_SPEND 
		FROM salaries
		GROUP BY teamID , yearID),

CS AS (SELECT teamID , yearID ,  
	SUM(TOTAL_SPEND) OVER( PARTITION BY teamID ORDER BY yearID) AS CUMULATIVE_SUM
    FROM TS ),

RN AS (SELECT teamID,yearID,CUMULATIVE_SUM ,
		ROW_NUMBER() OVER(PARTITION BY teamID ORDER BY CUMULATIVE_SUM ) AS RN 
		FROM CS 
		WHERE CUMULATIVE_SUM > 1000000000)

SELECT teamID , yearID , CUMULATIVE_SUM
FROM RN 
WHERE RN=1;
  • For each player, calculate their age at their first (debut) game, their last game, and their career length (all in years). Sort from longest career to shortest career.
  • What team did each player play on for their starting and ending years?
  • How many players started and ended on the same team and also played for over a decade?
-- view player table and number of player 
SELECT * FROM players ; 
SELECT COUNT(playerID) FROM players;

-- for each player calculate their age at their first game and their last game 
SELECT nameGiven,debut,finalGame,
		TRY_CAST(CONCAT(birthYear,'-',
		RIGHT('00'+ CAST(birthMonth AS VARCHAR),2),'-',
		RIGHT('00'+ CAST(birthDay AS VARCHAR),2))AS DATE ) AS BIRTHDATE,
		DATEDIFF(YEAR,TRY_CAST(CONCAT(birthYear,'-',birthMonth,'-',birthDay) AS DATE), debut ) AS STARTAGE ,
		DATEDIFF(YEAR,TRY_CAST(CONCAT(birthYear,'-',birthMonth,'-',birthDay) AS DATE), finalGame ) AS ENDAGE , 
		DATEDIFF(YEAR , debut , finalGame ) AS CAREER_LENGHT
FROM players
ORDER BY CAREER_LENGHT DESC ;

-- what team did each player play on for their starting and ending years ? 
SELECT P.nameGiven,
	   S.yearID AS STARTING_YEAR ,S.teamID AS STARTING_TEAM,
	   E.yearID AS ENDING_YEAR ,E.teamID AS ENDING_TEAM
FROM players P INNER JOIN salaries S 
               ON P.playerID=S.playerID AND
			   YEAR(P.debut) =S.yearID
			   
			   INNER JOIN salaries E 
               ON P.playerID=E.playerID AND
			   YEAR(P.finalGame) =E.yearID
			   
WHERE S.teamID = E.teamID AND E.yearID - S.yearID > 10 ;
  • Which players have the same birthday?
  • Create a summary table that shows for each team, what percent of players bat right, left and both.
  • How have average height and weight at debut game changed over the years, and what’s the decade-over-decade difference?
-- which player have the same birthday 
WITH BN AS (SELECT TRY_CAST(CONCAT(birthYear,'-',birthMonth,'-',birthDay) AS DATE) AS BIRTHDATE,
		nameGiven
        FROM players)

SELECT BIRTHDATE , STRING_AGG(nameGiven , '- ') AS PLAYERS , COUNT(nameGiven)
FROM 
BN 
WHERE BIRTHDATE IS NOT NULL AND YEAR(BIRTHDATE) BETWEEN 1980 AND 1990
GROUP BY BIRTHDATE
HAVING COUNT(nameGiven) > 2
ORDER BY BIRTHDATE
;

-- SUMMARY TABLE 
SELECT s.teamID,COUNT(s.playerID) AS NUM_PLAYERS, 
		ROUND(CAST(SUM(CASE WHEN P.bats='R' THEN 1 ELSE 0 END)  AS FLOAT)/ COUNT(s.playerID) *100 ,1)AS BATS_RIGHT, 
		ROUND(CAST(SUM(CASE WHEN P.bats= 'L' THEN 1 ELSE 0 END) AS FLOAT)/ COUNT(s.playerID) *100 , 1) AS BATS_LEFT,
		ROUND(CAST(SUM(CASE WHEN P.bats= 'B' THEN 1 ELSE 0 END) AS FLOAT) / COUNT(s.playerID) *100 , 1) AS BATS_BOTH
FROM salaries S LEFT JOIN players P 
ON S.playerID = p.playerID
GROUP BY s.teamID ;

-- average height and weight at debut game changed over the years 

WITH HW AS (SELECT ROUND(YEAR(debut),-1) AS DECADE ,AVG(height) AS AVG_HEIGHT , AVG(weight) AS AVG_WEIGHT 
FROM players
GROUP BY ROUND(YEAR(debut),-1) )

SELECT  DECADE,
        AVG_HEIGHT - LAG(AVG_HEIGHT) OVER(ORDER BY DECADE) AS HEIGHT_DIFF,
		AVG_WEIGHT - LAG(AVG_WEIGHT) OVER(ORDER BY DECADE) AS WEIGHT_DIFF
FROM HW
WHERE DECADE IS NOT NULL 
;
Scroll to Top