Obtaining Images & Data

In [69]:
import pandas as pd

# Web scraping
import requests
import xmltodict
import re # Regular Expressions
from bs4 import BeautifulSoup
import urllib

# Scrape the paginated artwork listing (artist id 13) on the-athenaeum.org.
# For every table row this records the title, the raw description text, the
# year and medium parsed out of that description, and the ids of the two
# images that are downloaded into the local image directory. The collected
# records end up in the DataFrame `data`.

import os

try:  # Python 2 location (the API this notebook was originally written for)
    from urllib import urlretrieve
except ImportError:  # Python 3 moved it into urllib.request
    from urllib.request import urlretrieve

LISTING_URL = "http://www.the-athenaeum.org/art/list.php?s=tu&m=a&aid=13&p="
FIRST_PAGE = 1
LAST_PAGE = 13  # inclusive; the listing is walked from page 1 through 13
IMAGE_DIR = 'images'
THUMB_SRC_PREFIX = 'display_image.php?id='

# Compiled once instead of once per row. The year pattern is a raw string so
# that "\(" and "\d" reach the regex engine untouched rather than being
# parsed as (invalid) string escapes.
ROW_CLASS_PATTERN = re.compile("r1|r2")
YEAR_PATTERN = re.compile(r'\((\d*|\d*-\d*)\)')
MEDIUM_PATTERN = re.compile('(pastel|oil|tempera)')


def _first_group(pattern, text):
    """Return the first captured group of `pattern` in `text`.

    When the pattern does not occur, an empty list is returned; that is the
    placeholder this script has always stored for "not listed".
    """
    match = pattern.search(text)
    return match.groups()[0] if match else []


def _download_image(file_id):
    """Download the image with id `file_id` to images/<file_id>.jpg."""
    # NOTE(review): image_base_url is not defined in this cell; it has to
    # exist in the session already (presumably the site URL ending in
    # 'display_image.php?id=') -- confirm before running on a fresh kernel.
    urlretrieve(image_base_url + str(file_id),
                IMAGE_DIR + '/' + str(file_id) + '.jpg')


# urlretrieve fails if the target directory is missing, so create it up front.
if not os.path.isdir(IMAGE_DIR):
    os.makedirs(IMAGE_DIR)

records = []

for page_number in range(FIRST_PAGE, LAST_PAGE + 1):
    # Get page from site
    url = LISTING_URL + str(page_number)
    page = requests.get(url)
    soup = BeautifulSoup(page.text)

    # Every artwork is one table row whose class matches r1 or r2.
    for row in soup.findAll('tr', {'class': ROW_CLASS_PATTERN}):

        # Full text of the second cell, kept verbatim alongside parsed fields
        desc = row.findAll('td')[1].get_text()

        # Artwork title
        title = row.find('div', {'class': 'list_title'}).get_text()

        # Year(s) created: the parenthesised part of the description, either
        # a single number or a range such as 1880-1881 (kept as a string).
        year = _first_group(YEAR_PATTERN, desc)

        # Medium the work was created in (if listed)
        medium = _first_group(MEDIUM_PATTERN, desc)

        # Reset per row so that a row without an image records None instead
        # of silently reusing the ids of the previous row.
        file_id_thumb = None
        file_id_full = None

        # Guard against rows that have no thumbnail <img> at all.
        img = row.find('img', {'vspace': '4'})
        img_link = img.get('src') if img is not None else None
        if img_link:
            file_id_thumb = int(float(img_link.replace(THUMB_SRC_PREFIX, '')))
            # The full-size image is requested under the next id after the
            # thumbnail's -- presumably a site convention; TODO confirm.
            file_id_full = file_id_thumb + 1
            _download_image(file_id_thumb)
            _download_image(file_id_full)

        records.append({
            'title': title,
            'desc': desc,
            'year': year,
            'medium': medium,
            'file_id_thumb': file_id_thumb,
            'file_id_full': file_id_full,
        })

data = pd.DataFrame(records)
In [70]:
# Write the scraped records to disk as UTF-8 CSV, leaving out the row index.
data.to_csv(
    path_or_buf="monet_data.csv",
    encoding='utf-8',
    index=False,
)