Web Scraping with Python
This handout will discuss how to scrape data from a website using the BeautifulSoup package in Python.
Website to Scrape – Darr Auction Site
Address:
The appearance of the website in the Chrome web browser.
A portion of the HTML contents of this website is provided here.
Load the requests and BeautifulSoup packages.
import requests
from bs4 import BeautifulSoup
Make a request to the web server for the contents of the specified webpage.
r = requests.get("...")   # the auction site address goes here
Inspect the contents of r using r.content.
r.content
A snippet of the contents of r
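Before parsing, it can be worth a quick check that the request actually succeeded; the attributes below are standard parts of the requests response object.
# Quick sanity checks on the response
r.status_code                  # 200 means the request succeeded
r.headers.get("Content-Type")  # typically something like 'text/html'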
Use BeautifulSoup to clean up this string a bit.
soup = BeautifulSoup(r.content, "html.parser")
Taking a look at soup
The prettify() function in BeautifulSoup will clean up the string a bit more so that it can be more easily understood.
print(soup.prettify())
Comparing print(soup) with print(soup.prettify()). The full script so far:
import requests
from bs4 import BeautifulSoup

# Go get the webpage
r = requests.get("...")   # the auction site address goes here

# See the contents of the webpage
r.content

# Put the page through BeautifulSoup
soup = BeautifulSoup(r.content, "html.parser")
soup

# Makes for pretty printing
print(soup.prettify())
Extract all links
# Find all links
soup.find_all("a")
# Print all links
for link in soup.find_all("a"):
    print(link.get("href"))
# Text of all links
for link in soup.find_all("a"):
    print(link.text)
# Making a list of all links -- pretty
for link in soup.find_all("a"):
    print("<a href='%s'>%s</a>" % (link.get("href"), link.text))
Putting links into a DataFrame
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# Putting links into a list named data
data = []
for link in soup.find_all("a"):
    data.append(link.get("href"))

# Converting the list to a DataFrame
data1 = DataFrame(data, columns=['Links'])
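The same DataFrame can also be built in one step with a list comprehension, equivalent to the loop above.
# One-step equivalent using a list comprehension
data1 = DataFrame([link.get("href") for link in soup.find_all("a")],
                  columns=['Links'])
data1.head()   # first five rows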
Extracting only the data
First, we need to understand the structure of the HTML contents of the document: in Google Chrome, right-click on the page and select Inspect element.
Understanding the elements of the document
Extracting the rows
# All of the data is in rows with class "DataRow", so narrow the grab to just that class
mydata = soup.find_all("tr", {"class": "DataRow"})
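For reference, here is a minimal self-contained sketch of the kind of structure this call matches. The row contents below are made up; only the tr/td layout and the DataRow class come from the page.
# A demo of narrowing to rows with class DataRow
# (sample_html is a hypothetical stand-in for the real page)
sample_html = """
<table>
  <tr class="HeaderRow"><td>ID</td><td>Photo</td><td>Description</td></tr>
  <tr class="DataRow"><td>101</td><td></td><td>Antique chair</td>
      <td>7</td><td>bidder42</td><td>55.00</td></tr>
</table>
"""
demo = BeautifulSoup(sample_html, "html.parser")
demo.find_all("tr", {"class": "DataRow"})   # matches only the data row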
The code for the various columns can be matched up against the actual table on the site. To rip out a single column from each row:
# Rip each column (tds[0] is the first cell in the row)
for row in mydata:
    tds = row.find_all("td")
    print(tds[0].get_text())
Additional code to summarize the data
dataid = []
datadescription = []
databids = []
datahighbidder = []
datacurrentamount = []
# Rip each column
for row in mydata:
    tds = row.find_all("td")
    #print(tds[0].get_text(), tds[2].get_text(), tds[3].get_text(), tds[4].get_text(), tds[5].get_text(), tds[7].get_text())
    dataid.append(tds[0].get_text())
    datadescription.append(tds[2].get_text())
    databids.append(tds[3].get_text())
    datahighbidder.append(tds[4].get_text())
    datacurrentamount.append(tds[5].get_text())

data2 = DataFrame(dataid, columns=['ID'])
data2['Description'] = datadescription
data2['Bids'] = databids
data2['HighBidder'] = datahighbidder
data2['Amount'] = datacurrentamount
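At this point a quick look at the assembled table confirms that everything lined up.
# Inspect the assembled table
data2.head()    # first five rows
data2.shape     # (number of rows, number of columns)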
# Try to get the amount spent by each bidder
# (Amount is still text at this point, so this will not sum correctly)
data2.pivot_table('Amount', index='HighBidder', aggfunc=sum)
# Check data types
data2.dtypes
# Convert the Amount column to a numeric type
data2['AmountNumber'] = data2['Amount'].astype(float)
data2.dtypes
# Number of winning bids per bidder
data2.pivot_table('Amount', index='HighBidder', aggfunc=len)

# Total amount per bidder, sorted from largest to smallest
data3 = data2.pivot_table('AmountNumber', index='HighBidder', aggfunc=sum)
data4 = data3['AmountNumber'].sort_values(ascending=False)
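As an aside, the same sorted totals can be produced with groupby, which is a common idiom when grouping on a single column.
# Equivalent summary using groupby
data4 = data2.groupby('HighBidder')['AmountNumber'].sum().sort_values(ascending=False)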
# Top 5 spenders
data4[:5]

# Spent over 100
data4[data4 > 100]