import operator

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as dates
%matplotlib inline
# Load the semicolon-separated CSV file (it must sit in the same folder as the
# notebook). The column names are supplied explicitly via `names`.
cols = ["id", "type", "newspaper", "date", "url", "category", "photoillustration"]
df = pd.read_csv(
    "merged.csv",
    sep=';',
    names=cols,
    na_values=" NaN",
    verbose=True,
)

# Shape of the loaded table as (rows, columns).
df.shape

# Distinct newspaper names exactly as they appear in the CSV file.
df.newspaper.unique()

# Shorten each newspaper title by keeping only the text before the first ':'.
# str() is applied first, so missing values become the string 'nan'.
df['newspaper2'] = df['newspaper'].map(lambda title: str(title).split(':', 1)[0])

# Distinct shortened newspaper names.
df.newspaper2.unique()

# Number of images per shortened title for the entire period.
df['newspaper2'].value_counts()

# Parse the dates as Year-Month-Day and use them as the row index, so the
# later time-based grouping can bin on the index.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df = df.set_index('date')
# Horizontal bar chart of the number of images per shortened title for the
# entire period.
images_per_title = df['newspaper2'].value_counts()
images_per_title.plot.barh()
plt.title("Number of images in different Dutch newspapers, 1893-1906")
# NOTE(review): matplotlib documents pad_inches as applying only when
# bbox_inches is 'tight', so it is presumably ignored here -- confirm intent.
plt.savefig('kranten.svg', dpi=400, bbox_inches=None, pad_inches=3.1)
# Count the images of all titles together in six-month bins of the date index.
# pd.Grouper(freq=...) replaces pd.TimeGrouper, which was removed in pandas 1.0.
# The '6M' alias is kept for compatibility with older pandas; pandas >= 2.2
# spells month-end frequency 'ME' and warns about 'M'.
All_Titles = df['newspaper'].groupby(pd.Grouper(freq='6M')).count()
# Half of every six-month count. The result is not stored, so it is only
# visible when this is the last expression of a notebook cell.
All_Titles / 2

# Line plot of the number of images in all titles per six months.
fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(All_Titles.index, All_Titles)
ax.xaxis.set_major_locator(dates.YearLocator(5))  # one major tick every 5 years
plt.xlabel("Date")
plt.title("Number of images in all titles")
plt.ylabel("Number")
plt.xticks(rotation=45)
plt.savefig('allimages.svg', dpi=300, bbox_inches='tight')
plt.show()
# Split the images into photos and illustrations and count each per six
# months. Illustrations are the rows whose 'photoillustration' value is
# "drawing". pd.Grouper(freq=...) replaces pd.TimeGrouper, which was removed
# in pandas 1.0.
Photo = df['newspaper'][df['photoillustration'] == "photo"].groupby(pd.Grouper(freq='6M')).count()
Illustration = df['newspaper'][df['photoillustration'] == "drawing"].groupby(pd.Grouper(freq='6M')).count()

# Line plot of photos, illustrations and all images in all titles per six months.
fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(Photo.index, Photo, label="Photo")
ax.plot(Illustration.index, Illustration, label="Illustration")
ax.plot(All_Titles.index, All_Titles, label="All images")
ax.xaxis.set_major_locator(dates.YearLocator(5))  # one major tick every 5 years

# Legend entries sorted alphabetically by label. (The reversed-order legend
# that used to be built first was immediately replaced by this one.)
handles, labels = ax.get_legend_handles_labels()
hl = sorted(zip(handles, labels), key=operator.itemgetter(1))
handles2, labels2 = zip(*hl)
ax.legend(handles2, labels2)

plt.xlabel("Date")
plt.title("Number of all images, photos and illustrations in all titles")
plt.ylabel("Number")
plt.xticks(rotation=45)
plt.savefig('photoillustration.svg', dpi=300, bbox_inches='tight')
plt.show()
# Line plot of the number of photos in all titles per six months.
fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(Photo.index, Photo, label="Photo")
ax.xaxis.set_major_locator(dates.YearLocator(5))  # one major tick every 5 years

# Only one labelled line is drawn, so the default legend is already complete;
# the former reverse-then-sort legend handling had nothing to reorder.
ax.legend()

plt.xlabel("Date")
plt.title("Number of photos in all titles")
plt.ylabel("Number")
plt.xticks(rotation=45)
plt.savefig('photos.svg', dpi=300, bbox_inches='tight')
plt.show()
def _six_month_counts(mask):
    """Count non-missing 'newspaper' entries selected by *mask* per six months.

    *mask* is a boolean Series aligned with ``df``; the selected rows are
    binned on the date index of ``df``.
    """
    # pd.Grouper(freq=...) replaces pd.TimeGrouper, which was removed in pandas 1.0.
    return df['newspaper'][mask].groupby(pd.Grouper(freq='6M')).count()


def _category_counts(keyword):
    """Return six-month counts of the rows whose 'category' contains *keyword*."""
    # na=False makes rows without a category non-matching, which is what the
    # former `.str.contains(...) == True` comparison achieved.
    return _six_month_counts(df['category'].str.contains(keyword, na=False))


# Six-month counts for each image category.
Building = _category_counts('building')
Cartoon = _category_counts('cartoon')
Chess = _category_counts('chess')
Crowds = _category_counts('crowds')
Face = _category_counts('face')
Logo = _category_counts('logo')
Maps = _category_counts('maps')
# Sheetmusic and Schematics are computed but not drawn in the figure below.
Sheetmusic = _category_counts('sheetmusic')
Schematics = _category_counts('schematics')
# NOTE(review): unlike the others, Unknown is selected on the
# 'photoillustration' column, not on 'category' -- confirm this is intended.
Unknown = _six_month_counts(df['photoillustration'] == "unknown")
Weather = _category_counts('weather')

# Line plot of the number of images per category in all titles per six months.
fig, ax = plt.subplots(figsize=(6, 3))
for label, counts in [
    ("Building", Building),
    ("Cartoon", Cartoon),
    ("Chess", Chess),
    ("Crowds", Crowds),
    ("Face", Face),
    ("Logo", Logo),
    ("Maps", Maps),
    ("Unknown", Unknown),
    ("Weather", Weather),
]:
    ax.plot(counts.index, counts, label=label)

# Legend entries sorted alphabetically by label. (The reversed-order legend
# that used to be built first was immediately replaced by this one.)
handles, labels = ax.get_legend_handles_labels()
hl = sorted(zip(handles, labels), key=operator.itemgetter(1))
handles2, labels2 = zip(*hl)
ax.legend(handles2, labels2)

ax.xaxis.set_major_locator(dates.YearLocator(5))  # one major tick every 5 years
plt.xlabel("Date")
# The title used to read "Number of photos in all titles", copied from the
# photo plot; this figure shows the per-category counts.
plt.title("Number of images per category in all titles")
plt.ylabel("Number")
plt.xticks(rotation=45)
# Saved under its own name: 'halftones.svg' is also written by another plot in
# this script, which would overwrite this figure.
plt.savefig('all_categories.svg', dpi=300, bbox_inches='tight')
plt.show()
# Line plot of the number of images in three categories (buildings, crowds and
# maps) in all titles per six months.
fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(Building.index, Building, label="Building")
ax.plot(Crowds.index, Crowds, label="Crowds")
ax.plot(Maps.index, Maps, label="Maps")

# Legend entries sorted alphabetically by label. (The reversed-order legend
# that used to be built first was immediately replaced by this one.)
handles, labels = ax.get_legend_handles_labels()
hl = sorted(zip(handles, labels), key=operator.itemgetter(1))
handles2, labels2 = zip(*hl)
ax.legend(handles2, labels2)

ax.xaxis.set_major_locator(dates.YearLocator(5))  # one major tick every 5 years
plt.xlabel("Date")
plt.title("Three categories in all titles")
plt.ylabel("Number")
plt.xticks(rotation=45)
plt.savefig('categories.svg', dpi=300, bbox_inches='tight')
plt.show()
# Select one title.
# NOTE(review): the filters below compare the full 'newspaper' column, not the
# shortened 'newspaper2'; if the stored titles carry a ':' suffix they will
# match nothing -- verify against the data.
Algemeen_Handelsblad = df['newspaper'][df['newspaper'] == "Algemeen Handelsblad"]

# Split the images of this single title into photos and illustrations
# ("drawing" rows) and count each per six months. pd.Grouper(freq=...)
# replaces pd.TimeGrouper, which was removed in pandas 1.0.
Photo1 = df.query('newspaper=="Algemeen Handelsblad" and photoillustration=="photo"')
Illustration1 = df.query('newspaper=="Algemeen Handelsblad" and photoillustration=="drawing"')
Photo1 = Photo1['newspaper'].groupby(pd.Grouper(freq='6M')).count()
Illustration1 = Illustration1['newspaper'].groupby(pd.Grouper(freq='6M')).count()

# Line plot of the photos and illustrations in Algemeen Handelsblad per six months.
fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(Photo1.index, Photo1, label="Photo")
ax.plot(Illustration1.index, Illustration1, label="Illustration")

# Legend entries sorted alphabetically by label. (The reversed-order legend
# that used to be built first was immediately replaced by this one.)
handles, labels = ax.get_legend_handles_labels()
hl = sorted(zip(handles, labels), key=operator.itemgetter(1))
handles2, labels2 = zip(*hl)
ax.legend(handles2, labels2)

ax.xaxis.set_major_locator(dates.YearLocator(5))  # one major tick every 5 years
plt.xlabel("Date")
# The title used to read "Number of photos in all titles", copied from the
# all-titles photo plot; this figure covers a single newspaper.
plt.title("Number of photos and illustrations in Algemeen Handelsblad")
plt.ylabel("Number")
plt.xticks(rotation=45)
# Saved under its own name: 'halftones.svg' is also written by another plot in
# this script, which would overwrite this figure.
plt.savefig('algemeen_handelsblad.svg', dpi=300, bbox_inches='tight')
plt.show()
# Select one title.
# NOTE(review): the filters below compare the full 'newspaper' column, not the
# shortened 'newspaper2'; if the stored titles carry a ':' suffix they will
# match nothing -- verify against the data.
Telegraaf = df['newspaper'][df['newspaper'] == "De Telegraaf"]

# Split the images of this single title into photos and illustrations
# ("drawing" rows) and count each per six months. pd.Grouper(freq=...)
# replaces pd.TimeGrouper, which was removed in pandas 1.0.
Photo1 = df.query('newspaper=="De Telegraaf" and photoillustration=="photo"')
Illustration1 = df.query('newspaper=="De Telegraaf" and photoillustration=="drawing"')
Photo1 = Photo1['newspaper'].groupby(pd.Grouper(freq='6M')).count()
Illustration1 = Illustration1['newspaper'].groupby(pd.Grouper(freq='6M')).count()

# Line plot of the photos and illustrations in De Telegraaf per six months.
fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(Photo1.index, Photo1, label="Photo")
ax.plot(Illustration1.index, Illustration1, label="Illustration")

# Legend entries sorted alphabetically by label. (The reversed-order legend
# that used to be built first was immediately replaced by this one.)
handles, labels = ax.get_legend_handles_labels()
hl = sorted(zip(handles, labels), key=operator.itemgetter(1))
handles2, labels2 = zip(*hl)
ax.legend(handles2, labels2)

ax.xaxis.set_major_locator(dates.YearLocator(5))  # one major tick every 5 years
plt.xlabel("Date")
# The title used to read "Number of photos in all titles", copied from the
# all-titles photo plot; this figure covers a single newspaper.
plt.title("Number of photos and illustrations in De Telegraaf")
plt.ylabel("Number")
plt.xticks(rotation=45)
# The output file name is intentionally left unchanged so that anything
# reading 'halftones.svg' keeps finding this figure.
plt.savefig('halftones.svg', dpi=300, bbox_inches='tight')
plt.show()