In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as dates
%matplotlib inline
In [2]:
#Loads the CSV file (make sure that it is in the same folder as the notebook)
cols = ["id", "type", "newspaper", "date", "url", "category", "photoillustration"]

df = pd.read_csv("merged.csv", sep=';', names=cols, na_values=" NaN", verbose=True)
Tokenization took: 299.80 ms
Type conversion took: 263.38 ms
Parser memory cleanup took: 0.02 ms
Tokenization took: 115.11 ms
Type conversion took: 119.20 ms
Parser memory cleanup took: 0.01 ms
In [3]:
#desribes the shape of the CSV file
df.shape 
Out[3]:
(192241, 7)
In [4]:
#returns the names of all the newspapers in the csv file
df.newspaper.unique()
Out[4]:
array(['Algemeen Handelsblad',
       'Provinciale Overijsselsche en Zwolsche courant : staats-, handels-, nieuws- en advertentieblad',
       'Nieuw Amsterdamsch handels- en effectenblad',
       'Utrechtsche provinciale en stads-courant : algemeen advertentieblad',
       'Provinciale Drentsche en Asser courant', 'Leeuwarder courant',
       'Rotterdamsche courant', "Dagblad van Zuidholland en 's Gravenhage",
       'Opregte Haarlemsche Courant',
       'Nieuwe Rotterdamsche courant : staats-, handels-, nieuws- en advertentieblad',
       'Leydse courant', 'Middelburgsche courant', 'Surinaamsch weekblad',
       "Provinciale Noordbrabantsche en 's Hertogenbossche courant",
       'De Curaçaosche courant',
       'Delftsche courant : nieuwsblad voor Delft en Delfland',
       'Het Amsterdamsch handels- en effectenblad',
       'De West-Indiër : dagblad toegewijd aan de belangen van Nederlandsch Guyana',
       'Nederlandsche staatscourant',
       'Utrechtsch provinciaal en stedelijk dagblad : algemeen advertentie-blad',
       'De Curaçaosche courant',
       'De Noord-Brabanter : staat- en letterkundig dagblad',
       'De Tijd : godsdienstig-staatkundig dagblad',
       'De locomotief : Samarangsch handels- en advertentie-blad',
       'Sumatra-courant : nieuws- en advertentieblad', 'Bredasche courant',
       'Bataviaasch handelsblad',
       'De kolonist : dagblad toegewyd aan de belangen van Suriname',
       'Nieuw Israelietisch weekblad', 'Koloniaal nieuwsblad',
       'Venloosch weekblad',
       'Surinaamsche courant en Gouvernements advertentie blad',
       'Java-bode : nieuws, handels- en advertentieblad voor Nederlandsch-Indie',
       'Arnhemsche courant',
       'Provinciale Overijsselsche en Zwolsche courant',
       'Apeldoornsche courant', 'Nieuwe Veendammer courant',
       'Suriname : koloniaal nieuws- en advertentieblad', 'De grondwet',
       'Het nieuws van den dag : kleine courant', 'Veendammer courant',
       'De maasbode', 'Tubantia', 'De standaard',
       'De Gooi- en Eemlander : nieuws- en advertentieblad',
       'Tilburgsche courant', 'Rotterdamsch nieuwsblad',
       'Soerabaijasch handelsblad',
       "De Graafschap-bode : nieuws- en advertentieblad voor stad- en ambt-Doetinchem, Hummelo en Keppel, Wehl, Zeddam, 's Heerenberg, Ulft, Gendringen, Sillevolde, Terborg, Varsseveld, Dinxperlo, Aalten, Breedevoorde, Lichtenvoorde, Groenlo, Neede, Eibergen, Bor",
       'De Volksvriend', 'Helmondsche courant', 'De Zuid-Willemsvaart',
       'De Amsterdammer : dagblad voor Nederland',
       'Amigoe di Curacao : weekblad voor de Curacaosche eilanden',
       'Haagsche courant', 'Nieuwe Tilburgsche Courant',
       'Bataviaasch nieuwsblad', 'Recht voor allen', 'Venloosche courant',
       'De Tijd : godsdienstig-staatkundig dagblad', 'De Volksstem',
       'Nieuwsblad van het Noorden',
       'Leeuwarder courant : hoofdblad van Friesland', 'De Telegraaf',
       'Nieuwe Surinaamsche courant', 'De Peel- en Kempenbode',
       'De Surinamer : nieuws- en advertentieblad',
       'Samarangsch advertentie-blad', 'Venloosch nieuwsblad',
       'De Preanger-bode', 'De Sumatra post',
       'Het nieuws van den dag voor Nederlandsch-Indië', nan,
       'Het volk : dagblad voor de arbeiderspartij',
       "Nieuwsblad van Friesland : Hepkema's courant",
       'Verzameling van verslagen en rapporten behoorende bij de Nederlandsche Staatscourant',
       'Provinciale Geldersche en Nijmeegsche courant',
       'Nieuwe Venlosche courant', 'Het Centrum',
       'Nieuwe Rotterdamsche Courant',
       'Het volk : dagblad voor de arbeiderspartij',
       'De West : nieuwsblad uit en voor Suriname',
       'De tribune : soc. dem. weekblad'], dtype=object)
In [5]:
#Shortens the names of the newspaper titles by deleting the text after ':'
df['newspaper2']=df['newspaper'].apply(lambda x: str(x).split(':')[0]) 
In [6]:
#returns the names of all the shortened newspapers in the csv file
df.newspaper2.unique()
Out[6]:
array(['Algemeen Handelsblad',
       'Provinciale Overijsselsche en Zwolsche courant ',
       'Nieuw Amsterdamsch handels- en effectenblad',
       'Utrechtsche provinciale en stads-courant ',
       'Provinciale Drentsche en Asser courant', 'Leeuwarder courant',
       'Rotterdamsche courant', "Dagblad van Zuidholland en 's Gravenhage",
       'Opregte Haarlemsche Courant', 'Nieuwe Rotterdamsche courant ',
       'Leydse courant', 'Middelburgsche courant', 'Surinaamsch weekblad',
       "Provinciale Noordbrabantsche en 's Hertogenbossche courant",
       'De Curaçaosche courant', 'Delftsche courant ',
       'Het Amsterdamsch handels- en effectenblad', 'De West-Indiër ',
       'Nederlandsche staatscourant',
       'Utrechtsch provinciaal en stedelijk dagblad ',
       'De Curaçaosche courant', 'De Noord-Brabanter ', 'De Tijd ',
       'De locomotief ', 'Sumatra-courant ', 'Bredasche courant',
       'Bataviaasch handelsblad', 'De kolonist ',
       'Nieuw Israelietisch weekblad', 'Koloniaal nieuwsblad',
       'Venloosch weekblad',
       'Surinaamsche courant en Gouvernements advertentie blad',
       'Java-bode ', 'Arnhemsche courant',
       'Provinciale Overijsselsche en Zwolsche courant',
       'Apeldoornsche courant', 'Nieuwe Veendammer courant', 'Suriname ',
       'De grondwet', 'Het nieuws van den dag ', 'Veendammer courant',
       'De maasbode', 'Tubantia', 'De standaard', 'De Gooi- en Eemlander ',
       'Tilburgsche courant', 'Rotterdamsch nieuwsblad',
       'Soerabaijasch handelsblad', 'De Graafschap-bode ',
       'De Volksvriend', 'Helmondsche courant', 'De Zuid-Willemsvaart',
       'De Amsterdammer ', 'Amigoe di Curacao ', 'Haagsche courant',
       'Nieuwe Tilburgsche Courant', 'Bataviaasch nieuwsblad',
       'Recht voor allen', 'Venloosche courant', 'De Tijd ',
       'De Volksstem', 'Nieuwsblad van het Noorden', 'Leeuwarder courant ',
       'De Telegraaf', 'Nieuwe Surinaamsche courant',
       'De Peel- en Kempenbode', 'De Surinamer ',
       'Samarangsch advertentie-blad', 'Venloosch nieuwsblad',
       'De Preanger-bode', 'De Sumatra post',
       'Het nieuws van den dag voor Nederlandsch-Indië', 'nan',
       'Het volk ', 'Nieuwsblad van Friesland ',
       'Verzameling van verslagen en rapporten behoorende bij de Nederlandsche Staatscourant',
       'Provinciale Geldersche en Nijmeegsche courant',
       'Nieuwe Venlosche courant', 'Het Centrum',
       'Nieuwe Rotterdamsche Courant', 'De West ', 'De tribune '], dtype=object)
In [7]:
#Shows the number of images per title for the enitre period 
df['newspaper2'].value_counts()
Out[7]:
Rotterdamsch nieuwsblad                            39835
Haagsche courant                                   27813
De Telegraaf                                       22870
Algemeen Handelsblad                               19850
Het nieuws van den dag                             11063
Nederlandsche staatscourant                        10811
Nieuwsblad van het Noorden                          7919
Nieuwe Tilburgsche Courant                          6008
Tilburgsche courant                                 5969
Nieuwsblad van Friesland                            4644
Nieuwe Rotterdamsche Courant                        4643
De grondwet                                         4433
Het volk                                            3248
Het nieuws van den dag voor Nederlandsch-Indië     2800
De Tijd                                              2758
De Graafschap-bode                                  2728
De maasbode                                         1596
Leeuwarder courant                                  1515
De Gooi- en Eemlander                               1393
Bataviaasch nieuwsblad                               966
De Volksvriend                                       848
De Volksstem                                         810
De Preanger-bode                                     635
Soerabaijasch handelsblad                            620
De locomotief                                        594
Dagblad van Zuidholland en 's Gravenhage             562
Provinciale Overijsselsche en Zwolsche courant       545
Bataviaasch handelsblad                              541
Het Centrum                                          414
De Sumatra post                                      401
                                                   ...  
Venloosch weekblad                                    48
Java-bode                                             39
Provinciale Geldersche en Nijmeegsche courant         33
Tubantia                                              30
Sumatra-courant                                       30
Rotterdamsche courant                                 29
De Tijd                                               28
Nieuwe Rotterdamsche courant                          23
Opregte Haarlemsche Courant                           20
De Amsterdammer                                       19
Provinciale Overijsselsche en Zwolsche courant         18
Nieuwe Veendammer courant                             17
Venloosch nieuwsblad                                  16
De standaard                                          16
Utrechtsch provinciaal en stedelijk dagblad            13
Surinaamsch weekblad                                  10
Nieuwe Venlosche courant                               9
Helmondsche courant                                    8
De Noord-Brabanter                                     7
Veendammer courant                                     7
De Curaçaosche courant                                6
De Curaçaosche courant                                 5
Bredasche courant                                      4
Leeuwarder courant                                     2
Leydse courant                                         2
De tribune                                             2
De West                                                2
Utrechtsche provinciale en stads-courant               1
Het Amsterdamsch handels- en effectenblad              1
Samarangsch advertentie-blad                           1
Name: newspaper2, Length: 82, dtype: int64
In [8]:
#Tells notebook to recognizes the dates as Year-Month-Day
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df = df.set_index(['date'])
In [9]:
#Plots a bar graph of the number of images per title for the entire period
df['newspaper2'].value_counts().plot(kind='barh') 
plt.title("Number of images in different Dutch newspapers, 1893-1906")
plt.savefig('kranten.svg', dpi=400, bbox_inches=None, pad_inches=3.1)
In [15]:
#Adds all the images in all the titles for a period of six months
All_Titles = df['newspaper'].groupby(pd.TimeGrouper('6M')).count()
In [16]:
All_Titles / 2
Out[16]:
date
1860-01-31       4.0
1860-07-31      34.0
1861-01-31      25.5
1861-07-31      35.0
1862-01-31      95.5
1862-07-31     138.0
1863-01-31     145.0
1863-07-31      21.0
1864-01-31      28.0
1864-07-31      20.0
1865-01-31      29.5
1865-07-31      27.0
1866-01-31      21.5
1866-07-31      22.5
1867-01-31      51.0
1867-07-31      58.5
1868-01-31      70.0
1868-07-31      37.0
1869-01-31      61.5
1869-07-31      60.5
1870-01-31      58.0
1870-07-31      46.0
1871-01-31      57.0
1871-07-31      76.5
1872-01-31      70.5
1872-07-31      87.5
1873-01-31      67.0
1873-07-31      76.5
1874-01-31      10.5
1874-07-31      14.0
               ...  
1902-07-31    1246.5
1903-01-31    1489.0
1903-07-31    1356.5
1904-01-31    1932.5
1904-07-31    2674.0
1905-01-31    2138.0
1905-07-31    2170.5
1906-01-31    1406.5
1906-07-31    1719.0
1907-01-31    1741.5
1907-07-31    2543.0
1908-01-31    2627.0
1908-07-31    2551.5
1909-01-31    2317.5
1909-07-31    2477.0
1910-01-31    2503.5
1910-07-31    2330.0
1911-01-31    2545.0
1911-07-31    2919.0
1912-01-31    3044.5
1912-07-31    3150.5
1913-01-31    3443.5
1913-07-31    3629.5
1914-01-31    3168.5
1914-07-31     158.5
1915-01-31     590.0
1915-07-31    3081.0
1916-01-31    2667.5
1916-07-31      73.5
1917-01-31      59.0
Name: newspaper, Length: 115, dtype: float64
In [11]:
#Plots a graph for the number of images in all the titles per six months 
fig, ax = plt.subplots(figsize=(6,3))
ax.plot(All_Titles.index, All_Titles)
ax.xaxis.set_major_locator(dates.YearLocator(5))
plt.xlabel("Date")
plt.title("Number of images in all titles")
plt.ylabel("Number")
plt.xticks(rotation=45)
plt.savefig('allimages.svg', dpi=300, bbox_inches='tight')
plt.show();
In [30]:
#Divides all the images in the categories 'phot' and 'illustration'
Photo = df['newspaper'][df['photoillustration'] == "photo"].groupby(pd.TimeGrouper('6M')).count() 
Illustration = df['newspaper'][df['photoillustration'] == "drawing"].groupby(pd.TimeGrouper('6M')).count()
In [31]:
#Plots a graph for the total number of images, photo's and illustrations in all the titles per six months 
fig, ax = plt.subplots(figsize=(6,3))
ax.plot(Photo.index, Photo, label="Photo")
ax.plot(Illustration.index, Illustration, label="Illustration")
ax.plot(All_Titles.index, All_Titles, label="All images")
ax.xaxis.set_major_locator(dates.YearLocator(5))
handles, labels = ax.get_legend_handles_labels()

# reverse the order
ax.legend(handles[::-1], labels[::-1])

# or sort them by labels
import operator
hl = sorted(zip(handles, labels),
            key=operator.itemgetter(1))
handles2, labels2 = zip(*hl)

ax.legend(handles2, labels2)
plt.xlabel("Date")
plt.title("Number of all images, photos and illustrations in all titles")
plt.ylabel("Number")
plt.xticks(rotation=45)
plt.savefig('photoillustration.svg', dpi=300, bbox_inches='tight')
plt.show();
In [32]:
#Plots a graph for the number of photos in all the titles per six months 
fig, ax = plt.subplots(figsize=(6,3))
ax.plot(Photo.index, Photo, label="Photo")
ax.xaxis.set_major_locator(dates.YearLocator(5))
handles, labels = ax.get_legend_handles_labels()

# reverse the order
ax.legend(handles[::-1], labels[::-1])

# or sort them by labels
import operator
hl = sorted(zip(handles, labels),
            key=operator.itemgetter(1))
handles2, labels2 = zip(*hl)

ax.legend(handles2, labels2)
plt.xlabel("Date")
plt.title("Number of photos in all titles")
plt.ylabel("Number")
plt.xticks(rotation=45)
plt.savefig('photos.svg', dpi=300, bbox_inches='tight')
plt.show();
In [33]:
#Divides all the images into the nine different categories + faces
Building = df['newspaper'][df['category'].str.contains('building')==True].groupby(pd.TimeGrouper('6M')).count() 
Cartoon = df['newspaper'][df['category'].str.contains('cartoon')==True].groupby(pd.TimeGrouper('6M')).count()
Chess = df['newspaper'][df['category'].str.contains('chess')==True].groupby(pd.TimeGrouper('6M')).count()
Crowds = df['newspaper'][df['category'].str.contains('crowds')==True].groupby(pd.TimeGrouper('6M')).count()
Face = df['newspaper'][df['category'].str.contains('face')==True].groupby(pd.TimeGrouper('6M')).count()
Logo = df['newspaper'][df['category'].str.contains('logo')==True].groupby(pd.TimeGrouper('6M')).count()
Maps = df['newspaper'][df['category'].str.contains('maps')==True].groupby(pd.TimeGrouper('6M')).count()
Sheetmusic = df['newspaper'][df['category'].str.contains('sheetmusic')==True].groupby(pd.TimeGrouper('6M')).count()
Schematics = df['newspaper'][df['category'].str.contains('schematics')==True].groupby(pd.TimeGrouper('6M')).count()
Unknown = df['newspaper'][df['photoillustration'] == "unknown"].groupby(pd.TimeGrouper('6M')).count()
Weather = df['newspaper'][df['category'].str.contains('weather')==True].groupby(pd.TimeGrouper('6M')).count()
In [34]:
#Plots a graph for the total number of images in the nine different categories + faces in all the titles per six months 
fig, ax = plt.subplots(figsize=(6,3)) 
ax.plot(Building.index, Building, label="Building")
ax.plot(Cartoon.index, Cartoon, label="Cartoon")
ax.plot(Chess.index, Chess, label="Chess")
ax.plot(Crowds.index, Crowds, label="Crowds")
ax.plot(Face.index, Face, label="Face")
ax.plot(Logo.index, Logo, label="Logo")
ax.plot(Maps.index, Maps, label="Maps")
ax.plot(Unknown.index, Unknown, label="Unknown")
ax.plot(Weather.index, Weather, label="Weather")
handles, labels = ax.get_legend_handles_labels()

# reverse the order
ax.legend(handles[::-1], labels[::-1])

# or sort them by labels
import operator
hl = sorted(zip(handles, labels),
            key=operator.itemgetter(1))
handles2, labels2 = zip(*hl)

ax.legend(handles2, labels2)
ax.xaxis.set_major_locator(dates.YearLocator(5))
plt.xlabel("Date")
plt.title("Number of photos in all titles")
plt.ylabel("Number")
plt.xticks(rotation=45)
plt.savefig('halftones.svg', dpi=300, bbox_inches='tight')
plt.show();
In [35]:
#Plots a graph for the total number of images in the nine different categories + faces in all the titles per six months 
fig, ax = plt.subplots(figsize=(6,3)) 
ax.plot(Building.index, Building, label="Building")
ax.plot(Crowds.index, Crowds, label="Crowds")
ax.plot(Maps.index, Maps, label="Maps")
handles, labels = ax.get_legend_handles_labels()

# reverse the order
ax.legend(handles[::-1], labels[::-1])

# or sort them by labels
import operator
hl = sorted(zip(handles, labels),
            key=operator.itemgetter(1))
handles2, labels2 = zip(*hl)

ax.legend(handles2, labels2)
ax.xaxis.set_major_locator(dates.YearLocator(5))
plt.xlabel("Date")
plt.title("Three categories in all titles")
plt.ylabel("Number")
plt.xticks(rotation=45)
plt.savefig('categories.svg', dpi=300, bbox_inches='tight')
plt.show();
In [36]:
#Select one title 
Algemeen_Handelsblad = df['newspaper'][df['newspaper'] == "Algemeen Handelsblad"]
In [37]:
#Divides all the images in a single title in 'photo' and 'illustration'
Photo1 = df.query('newspaper=="Algemeen Handelsblad" and photoillustration=="photo"')
Illustration1 = df.query('newspaper=="Algemeen Handelsblad" and photoillustration=="drawing"')
Photo1= Photo1['newspaper'].groupby(pd.TimeGrouper('6M')).count()
Illustration1= Illustration1['newspaper'].groupby(pd.TimeGrouper('6M')).count()
In [38]:
#Plots a graph for the total number of images, photo's and illustrations in all the titles per six months 
fig, ax = plt.subplots(figsize=(6,3)) 
ax.plot(Photo1.index, Photo1, label="Photo")
ax.plot(Illustration1.index, Illustration1, label="Illustration")
handles, labels = ax.get_legend_handles_labels()

# reverse the order
ax.legend(handles[::-1], labels[::-1])

# or sort them by labels
import operator
hl = sorted(zip(handles, labels),
            key=operator.itemgetter(1))
handles2, labels2 = zip(*hl)

ax.legend(handles2, labels2)
ax.xaxis.set_major_locator(dates.YearLocator(5))
plt.xlabel("Date")
plt.title("Number of photos in all titles")
plt.ylabel("Number")
plt.xticks(rotation=45)
plt.savefig('halftones.svg', dpi=300, bbox_inches='tight')
plt.show();
In [39]:
#Select one title 
Telegraaf = df['newspaper'][df['newspaper'] == "De Telegraaf"]
In [40]:
#Divides all the images in a single title in 'photo' and 'illustration'
Photo1 = df.query('newspaper=="De Telegraaf" and photoillustration=="photo"')
Illustration1 = df.query('newspaper=="De Telegraaf" and photoillustration=="drawing"')
Photo1= Photo1['newspaper'].groupby(pd.TimeGrouper('6M')).count()
Illustration1= Illustration1['newspaper'].groupby(pd.TimeGrouper('6M')).count()
In [41]:
#Plots a graph for the total number of images, photo's and illustrations in all the titles per six months 
fig, ax = plt.subplots(figsize=(6,3)) 
ax.plot(Photo1.index, Photo1, label="Photo")
ax.plot(Illustration1.index, Illustration1, label="Illustration")
handles, labels = ax.get_legend_handles_labels()

# reverse the order
ax.legend(handles[::-1], labels[::-1])

# or sort them by labels
import operator
hl = sorted(zip(handles, labels),
            key=operator.itemgetter(1))
handles2, labels2 = zip(*hl)

ax.legend(handles2, labels2)
ax.xaxis.set_major_locator(dates.YearLocator(5))
plt.xlabel("Date")
plt.title("Number of photos in all titles")
plt.ylabel("Number")
plt.xticks(rotation=45)
plt.savefig('halftones.svg', dpi=300, bbox_inches='tight')
plt.show();