## Reading Data from baseball-reference.com using BeautifulSoup in Python

### Read Player Standard Batting table from baseball-reference.com

In [1]:
import pandas as pd
import os.path
import requests
from bs4 import BeautifulSoup

def getStandardBatting(league, year):
    csvfile = league + '_' + str(year) + '-standard-batting.csv'
    if os.path.isfile(csvfile):
        dd = pd.read_csv(csvfile)
    else:
        r = requests.get("https://www.baseball-reference.com/leagues/"+league+"/"+str(year)+"-standard-batting.shtml")
        soup = BeautifulSoup(r.content, "html.parser") # try lxml
        div = soup.find('div', id='all_players_standard_batting')
        bdiv = bytearray(str(div.contents),'utf-8')
        soup2 = BeautifulSoup(bdiv, "html.parser")
        tbody = soup2.find('tbody')
        rows = tbody.find_all('tr')
        columns = ['Name','Age','Tm','G','PA','AB','R','H','2B','3B','HR','RBI','SB','CS','BB','SO',
                   'BA','OBP','SLG','OPS','OPS+','TB','GDP','HBP','SH','SF','IBB','Pos Summary']
        dd = pd.DataFrame(columns = columns)
        irow = 0
        for row in rows:
            cols = row.find_all('td')
            if (len(cols) >= 28):
                dd.loc[irow] = [
                    cols[0].text.strip(),
                    cols[1].text.strip(),
                    cols[2].text.strip(),
                    cols[3].text.strip(),
                    cols[4].text.strip(),
                    cols[5].text.strip(),
                    cols[6].text.strip(),
                    cols[7].text.strip(),
                    cols[8].text.strip(),
                    cols[9].text.strip(),
                    cols[10].text.strip(),
                    cols[11].text.strip(),
                    cols[12].text.strip(),
                    cols[13].text.strip(),
                    cols[14].text.strip(),
                    cols[15].text.strip(),
                    cols[16].text.strip(),
                    cols[17].text.strip(),
                    cols[18].text.strip(),
                    cols[19].text.strip(),
                    cols[20].text.strip(),
                    cols[21].text.strip(),
                    cols[22].text.strip(),
                    cols[23].text.strip(),
                    cols[24].text.strip(),
                    cols[25].text.strip(),
                    cols[26].text.strip(),
                    cols[27].text.strip()
                ]
                irow = irow+1
        dd.index += 1
        dd.to_csv(csvfile)
    return dd

nl17 = getStandardBatting('NL', 2017)
al17 = getStandardBatting('AL', 2017)
print(nl17)
print(al17)
print(nl17.iloc[0,:])
print(al17.iloc[0,:])

     Unnamed: 0                  Name   Age   Tm    G   PA   AB    R    H  2B  \
0             1     Cristhian Adames#  25.0  COL   12   14   13    1    0   0   
1             2          Austin Adams  26.0  WSN    6    0    0    0    0   0   
2             3            Lane Adams  27.0  ATL   85  122  109   19   30   4   
3             4           Matt Adams*  28.0  TOT  131  367  339   46   93  22   
4             5           Matt Adams*  28.0  STL   31   53   48    4   14   2   
5             6           Matt Adams*  28.0  ATL  100  314  291   42   79  20   
6             7           Tim Adleman  29.0  CIN   27   32   29    0    3   1   
7             8         Jesus Aguilar  27.0  MIL  133  311  279   40   74  15   
8             9            Nick Ahmed  27.0  ARI   53  178  167   24   42   8   
9            10          Matt Albers*  34.0  WSN   59    0    0    0    0   0   
10           11         Ozzie Albies#  20.0  ATL   57  244  217   34   62   9   
11           12  Arismendy A

### Read Player Starter Pitching table from baseball-reference.com

In [2]:
import pandas as pd
import os.path
import requests
from bs4 import BeautifulSoup

def getStarterPitching(league, year):
    csvfile = league + '_' + str(year) + '-starter-pitching.csv'
    if os.path.isfile(csvfile):
        dd = pd.read_csv(csvfile)
    else:
        r = requests.get("https://www.baseball-reference.com/leagues/"+league+"/"+str(year)+"-starter-pitching.shtml")
        soup = BeautifulSoup(r.content, "html.parser") # try lxml
        div = soup.find('div', id='all_players_starter_pitching')
        bdiv = bytearray(str(div.contents),'utf-8')
        soup2 = BeautifulSoup(bdiv, "html.parser")
        tbody = soup2.find('tbody')
        rows = tbody.find_all('tr')
        columns = ['Name','Age','Tm','IP','G','GS','Wgs','Lgs','ND','Wchp','Ltuf','Wtm','Ltm','tmW-L%','Wist','Lsv',
                   'CG','SHO','QS','QS%','GmScA','Best','Wrst','BQR','BQS','sDR','IDR','RS/GS','RS/IP','IP/GS',
                   'Pit/GS','<80','80-99','100-119','≥120','Max']
        dd = pd.DataFrame(columns = columns)
        irow = 0
        for row in rows:
            cols = row.find_all('td')
            if (len(cols) >= 28):
                dd.loc[irow] = [
                    cols[0].text.strip(),
                    cols[1].text.strip(),
                    cols[2].text.strip(),
                    cols[3].text.strip(),
                    cols[4].text.strip(),
                    cols[5].text.strip(),
                    cols[6].text.strip(),
                    cols[7].text.strip(),
                    cols[8].text.strip(),
                    cols[9].text.strip(),
                    cols[10].text.strip(),
                    cols[11].text.strip(),
                    cols[12].text.strip(),
                    cols[13].text.strip(),
                    cols[14].text.strip(),
                    cols[15].text.strip(),
                    cols[16].text.strip(),
                    cols[17].text.strip(),
                    cols[18].text.strip(),
                    cols[19].text.strip(),
                    cols[20].text.strip(),
                    cols[21].text.strip(),
                    cols[22].text.strip(),
                    cols[23].text.strip(),
                    cols[24].text.strip(),
                    cols[25].text.strip(),
                    cols[26].text.strip(),
                    cols[27].text.strip(),
                    cols[28].text.strip(),
                    cols[29].text.strip(),
                    cols[30].text.strip(),
                    cols[31].text.strip(),
                    cols[32].text.strip(),
                    cols[33].text.strip(),
                    cols[34].text.strip(),
                    cols[35].text.strip()
                ]
                irow = irow+1
        dd.index += 1
        dd.to_csv(csvfile)
    return dd

nl17 = getStarterPitching('NL', 2017)
al17 = getStarterPitching('AL', 2017)
print(nl17)
print(al17)
print(nl17.iloc[0,:])
print(al17.iloc[0,:])

     Unnamed: 0                 Name  Age   Tm     IP   G  GS  Wgs  Lgs  ND  \
0             1          Tim Adleman   29  CIN  122.1  30  20    5    9   6   
1             2    Henderson Alvarez   27  PHI   14.2   3   3    0    1   2   
2             3      Brett Anderson*   29  CHC   22.0   6   6    2    2   2   
3             4       Chase Anderson   29  MIL  141.1  25  25   12    4   9   
4             5      Tyler Anderson*   27  COL   86.0  17  15    5    6   4   
5             6         Jake Arrieta   31  CHC  168.1  30  30   14   10   6   
6             7       Bronson Arroyo   40  CIN   71.0  14  14    3    6   5   
7             8         Homer Bailey   31  CIN   91.0  18  18    6    9   3   
8             9       Anthony Banda*   23  ARI   25.2   8   4    1    3   0   
9            10          Chad Bettis   28  COL   46.1   9   9    2    4   3   
10           11            Ty Blach*   26  SFG  163.2  34  24    8   12   4   
11           12          Aaron Blair   25  ATL    3.