parse_raw_data

Parsing raw measurements into tidy format

In [1]:
import numpy as np
import pandas as pd
import re

import bokeh.io
import bokeh.plotting

bokeh.io.output_notebook()
Loading BokehJS ...

We read in the raw measurments made in Avizo on the spiracles, and then get them into a tidy format.

In [2]:
df_raw = pd.read_csv("./spiracle_data.csv")
df_raw = df_raw.melt(id_vars=['species', 'sex', 'mass', 'mm_pix', 'pix_mm'])
df_raw['spiracle'] = [i[0:-2] for i in df_raw['variable'].values]
df_raw['dim'] = [i[-1:] for i in df_raw['variable'].values]
k = 0
df_raw_tidy = pd.DataFrame()
for i, j in df_raw.groupby(['dim',]):
    if k == 0:
        df_raw_tidy = j.copy()
        df_raw_tidy[i] = j['value'].values
    else:
        df_raw_tidy[i] = j['value'].values
    k += 1
df_raw_tidy.drop(columns=['variable', 'value'])
df_raw_tidy['area pix^2'] = np.pi*(df_raw_tidy['H']/2)*(df_raw_tidy['V']/2)
df_raw_tidy['depth'] = df_raw_tidy['D']
df_raw_tidy['area mm^2'] = df_raw_tidy['area pix^2']*(df_raw_tidy['mm_pix']**2)
df_raw_tidy['depth'] = df_raw_tidy['depth']*df_raw_tidy['mm_pix']
df_raw_tidy['log mass'] = np.log(df_raw_tidy['mass'])
df_raw_tidy['log mass'] = np.log(df_raw_tidy['mass'])

lookup = {'Ab_VI': '6', 'Ab_V': '5', 'Ab_IV': '4', 'Ab_III': '3', 'Ab_II': '2', 'Ab_I': '1', 'Meso': 'S', 'Meta': 'T'}
df_raw_tidy['spiracle'] = [lookup[s] for s in df_raw_tidy['spiracle']]

df_raw_tidy.head()
Out[2]:
species sex mass mm_pix pix_mm variable value spiracle dim D H V area pix^2 depth area mm^2 log mass
34 Popilia_japonica F 0.1549 0.011494 87.001914 Ab_VI_D 15.18 6 D 15.18 8.77 8.02 55.241294 0.174479 0.007298 -1.864976
35 Popilia_japonica M 0.1075 0.011790 84.817642 Ab_VI_D 13.14 6 D 13.14 6.68 6.21 32.580515 0.154921 0.004529 -2.230264
36 Cyclocephala_borealis M 0.0969 0.012670 78.926598 Ab_VI_D 21.92 6 D 21.92 4.55 5.34 19.082819 0.277726 0.003063 -2.334076
37 Cyclocephala_borealis F 0.1185 0.011719 85.331513 Ab_VI_D 24.17 6 D 24.17 5.33 5.83 24.405384 0.283248 0.003352 -2.132842
38 Goliathus_goliathus M 16.2800 0.013911 71.884332 Ab_VI_D 180.62 6 D 180.62 43.14 41.85 1417.964913 2.512648 0.274408 2.789937

Some summary information about the measurements in the smallest beetles, demonstrating sufficient resolution to make accurate spiracle measurements.

In [3]:
print('Average pixel count for spiracle width in smallest beetles:  '  + str(round(df_raw_tidy.loc[df_raw_tidy['log mass'] < -1]['H'].mean(), 2)))
print('Average pixel count for spiracle height in smallest beetles: ' + str(round(df_raw_tidy.loc[df_raw_tidy['log mass'] < -1]['V'].mean(), 2)))
print('Average pixel count for spiracle depth in smallest beetles:  '  + str(round(df_raw_tidy.loc[df_raw_tidy['log mass'] < -1]['D'].mean(), 2)))
Average pixel count for spiracle width in smallest beetles:  8.84
Average pixel count for spiracle height in smallest beetles: 10.92
Average pixel count for spiracle depth in smallest beetles:  16.93

We will read in the other pre-formated tidy data table to confirm that it matches what we have calculated here.

In [4]:
df = pd.read_csv("./20190322_supp_table_2.csv")

First we will check that all the species names match us.

In [5]:
print(np.sort(df_raw_tidy['species'].unique()))
print(np.sort(np.unique(np.array([re.sub(' ', '_', s) for s in df['species'].values]))))
print(np.sort(df_raw_tidy['species'].unique()) == np.sort(np.unique(np.array([re.sub(' ', '_', s) for s in df['species'].values]))))
['Coelorrhina_hornimani' 'Cyclocephala_borealis' 'Dicronorrhina_derbyana'
 'Dynastes_hercules' 'Eudicella_euthalia' 'Goliathus_goliathus'
 'Mecynorrhina_torquata' 'Popilia_japonica' 'Protaetia_orientalis'
 'Trypoxylus_dichotomus']
['Coelorrhina_hornimani' 'Cyclocephala_borealis' 'Dicronorrhina_derbyana'
 'Dynastes_hercules' 'Eudicella_euthalia' 'Goliathus_goliathus'
 'Mecynorrhina_torquata' 'Popilia_japonica' 'Protaetia_orientalis'
 'Trypoxylus_dichotomus']
[ True  True  True  True  True  True  True  True  True  True]

Looks good! We will plot all the spiracle areas vs mass and depths vs mass on top of each other from both data tables to confirm that they match one another.

In [6]:
p = bokeh.plotting.figure(plot_height=350, title='log(area) vs log(mass)')
p.circle(np.log10(df_raw_tidy['mass']), np.log10(df_raw_tidy['area mm^2']), size=20)
p.circle(np.log10(df['mass (g)']), np.log10(df['area (mm^2)']), size=10, color='black')
bokeh.io.show(p)

p = bokeh.plotting.figure(plot_height=350, title='log(depth) vs log(mass)')
p.circle(np.log10(df_raw_tidy['mass']), np.log10(df_raw_tidy['depth']), size=20)
p.circle(np.log10(df['mass (g)']), np.log10(df['depth (mm)']), size=10, color='black')
bokeh.io.show(p)

Looks identical! We can make a similar plot with each individual spiracle to confirm the results are the same.

In [7]:
for s in df['spiracle'].unique():
    p = bokeh.plotting.figure(plot_height=350, title=s + ' log(area) vs log(mass)')
    p.circle(np.log10(df_raw_tidy.loc[df_raw_tidy['spiracle'] == s, 'mass']), np.log10(df_raw_tidy.loc[df_raw_tidy['spiracle'] == s, 'area mm^2']), size=20)
    p.circle(np.log10(df.loc[df['spiracle'] == s, 'mass (g)']), np.log10(df.loc[df['spiracle'] == s, 'area (mm^2)']), size=10, color='black')
    bokeh.io.show(p)

    p = bokeh.plotting.figure(plot_height=350, title=s + ' log(depth) vs log(mass)')
    p.circle(np.log10(df_raw_tidy.loc[df_raw_tidy['spiracle'] == s, 'mass']), np.log10(df_raw_tidy.loc[df_raw_tidy['spiracle'] == s, 'depth']), size=20)
    p.circle(np.log10(df.loc[df['spiracle'] == s, 'mass (g)']), np.log10(df.loc[df['spiracle'] == s, 'depth (mm)']), size=10, color='black')
    bokeh.io.show(p)

We do the same for the species averaged values.

In [8]:
df_av1 = df_raw_tidy.dropna().groupby(['species', 'spiracle'], as_index=False).aggregate(np.average)
df_av2 = df.groupby(['species', 'spiracle'], as_index=False).aggregate(np.average)

for s in df['spiracle'].unique():
    p = bokeh.plotting.figure(plot_height=350, title=s + ' log(area) vs log(mass)')
    p.circle(np.log10(df_av1.loc[df_av1['spiracle'] == s, 'mass']), np.log10(df_av1.loc[df_av1['spiracle'] == s, 'area mm^2']), size=20)
    p.circle(np.log10(df_av2.loc[df_av2['spiracle'] == s, 'mass (g)']), np.log10(df_av2.loc[df_av2['spiracle'] == s, 'area (mm^2)']), size=10, color='black')
    bokeh.io.show(p)

    p = bokeh.plotting.figure(plot_height=350, title=s + ' log(depth) vs log(mass)')
    p.circle(np.log10(df_av1.loc[df_av1['spiracle'] == s, 'mass']), np.log10(df_av1.loc[df_av1['spiracle'] == s, 'depth']), size=20)
    p.circle(np.log10(df_av2.loc[df_av2['spiracle'] == s, 'mass (g)']), np.log10(df_av2.loc[df_av2['spiracle'] == s, 'depth (mm)']), size=10, color='black')
    bokeh.io.show(p)
In [ ]: