Parse raw data
Parsing raw measurements into tidy format
In [1]:
import numpy as np
import pandas as pd
import re
import bokeh.io
import bokeh.plotting
# Configure Bokeh so that figures shown with bokeh.io.show() render inline in the notebook.
bokeh.io.output_notebook()
We read in the raw measurements made in Avizo on the spiracles, and then get them into a tidy format.
In [2]:
# Load the wide raw measurement table and melt it to long form: one row per
# (specimen, measurement column), with the measurement column's name stored in
# 'variable' and its reading in 'value'.
df_raw = pd.read_csv("./spiracle_data.csv")
df_raw = df_raw.melt(id_vars=['species', 'sex', 'mass', 'mm_pix', 'pix_mm'])

# Split each measurement name into a spiracle label (everything except the
# last two characters) and a dimension code (the last character).
df_raw['spiracle'] = df_raw['variable'].str[:-2]
df_raw['dim'] = df_raw['variable'].str[-1:]

# Build the tidy table: the rows of the first dimension group supply the id
# columns, then every dimension group contributes one column named after its
# code (the code below relies on 'H', 'V' and 'D' being present).
# Group by the bare column name rather than a one-element list: with a list,
# pandas >= 2.0 yields 1-tuple keys and the new columns would be named ('H',).
df_raw_tidy = pd.DataFrame()
for position, (dim, dim_rows) in enumerate(df_raw.groupby('dim')):
    if position == 0:
        df_raw_tidy = dim_rows.copy()
    elif not np.array_equal(dim_rows['spiracle'].values,
                            df_raw_tidy['spiracle'].values):
        # Values are attached by position, so every group must list the
        # spiracles in the same order or readings land on the wrong rows.
        raise ValueError(
            "rows for dimension %r are not aligned with the first dimension group" % dim
        )
    df_raw_tidy[dim] = dim_rows['value'].values

# Drop the long-format helper columns. They only describe the first group
# after the copy above, so they are stale. drop() returns a new frame and must
# be assigned back to take effect.
df_raw_tidy = df_raw_tidy.drop(columns=['variable', 'value', 'dim'])

# Area of an ellipse whose axes are H and V, in pixel units.
df_raw_tidy['area pix^2'] = np.pi*(df_raw_tidy['H']/2)*(df_raw_tidy['V']/2)
# Scale pixel quantities by 'mm_pix' (presumably mm per pixel — confirm against
# the data dictionary): lengths scale linearly, areas quadratically.
df_raw_tidy['depth'] = df_raw_tidy['D']*df_raw_tidy['mm_pix']
df_raw_tidy['area mm^2'] = df_raw_tidy['area pix^2']*(df_raw_tidy['mm_pix']**2)
# Natural log of mass; used below to select the smallest beetles.
df_raw_tidy['log mass'] = np.log(df_raw_tidy['mass'])

# Replace the raw spiracle labels with the short codes used in the comparison
# table. An unknown label raises KeyError rather than being passed through.
lookup = {'Ab_VI': '6', 'Ab_V': '5', 'Ab_IV': '4', 'Ab_III': '3', 'Ab_II': '2', 'Ab_I': '1', 'Meso': 'S', 'Meta': 'T'}
df_raw_tidy['spiracle'] = [lookup[s] for s in df_raw_tidy['spiracle']]
df_raw_tidy.head()
Out[2]:
Some summary information about the measurements in the smallest beetles, demonstrating sufficient resolution to make accurate spiracle measurements.
In [3]:
# Restrict to the smallest beetles (natural-log mass below -1) once, then
# report the mean raw pixel extent of each measured dimension.
smallest = df_raw_tidy.loc[df_raw_tidy['log mass'] < -1]
for label, column in [
    ('Average pixel count for spiracle width in smallest beetles: ', 'H'),
    ('Average pixel count for spiracle height in smallest beetles: ', 'V'),
    ('Average pixel count for spiracle depth in smallest beetles: ', 'D'),
]:
    print(label + str(round(smallest[column].mean(), 2)))
We will read in the other pre-formatted tidy data table to confirm that it matches what we have calculated here.
In [4]:
# Previously formatted tidy table that the values computed above are compared against.
supp_table_path = "./20190322_supp_table_2.csv"
df = pd.read_csv(supp_table_path)
First we will check that the species names in the two tables match.
In [5]:
# Sorted unique species names from the raw table.
raw_species = np.sort(df_raw_tidy['species'].unique())
# The comparison table writes names with spaces; swap them for underscores
# before comparing. np.unique already returns its result sorted.
table_species = np.unique(np.array([re.sub(' ', '_', name) for name in df['species'].values]))

print(raw_species)
print(table_species)
# Element-wise comparison of the two sorted name lists.
print(raw_species == table_species)
Looks good! We will plot all the spiracle areas vs mass and depths vs mass on top of each other from both data tables to confirm that they match one another.
In [6]:
# One figure per quantity. The values computed here are drawn as large
# default-coloured circles, with the comparison table's values as smaller
# black circles on top, so matching points appear as a black dot inside a
# larger circle.
for panel_title, raw_column, table_column in [
    ('log(area) vs log(mass)', 'area mm^2', 'area (mm^2)'),
    ('log(depth) vs log(mass)', 'depth', 'depth (mm)'),
]:
    p = bokeh.plotting.figure(plot_height=350, title=panel_title)
    p.circle(np.log10(df_raw_tidy['mass']), np.log10(df_raw_tidy[raw_column]), size=20)
    p.circle(np.log10(df['mass (g)']), np.log10(df[table_column]), size=10, color='black')
    bokeh.io.show(p)
Looks identical! We can make a similar plot with each individual spiracle to confirm the results are the same.
In [7]:
# Repeat the overlay separately for each spiracle code found in the comparison table.
for s in df['spiracle'].unique():
    # Rows of each table belonging to this spiracle.
    raw_rows = df_raw_tidy.loc[df_raw_tidy['spiracle'] == s]
    table_rows = df.loc[df['spiracle'] == s]

    for title_suffix, raw_column, table_column in [
        (' log(area) vs log(mass)', 'area mm^2', 'area (mm^2)'),
        (' log(depth) vs log(mass)', 'depth', 'depth (mm)'),
    ]:
        p = bokeh.plotting.figure(plot_height=350, title=s + title_suffix)
        # Large circles: values computed here; small black circles: comparison table.
        p.circle(np.log10(raw_rows['mass']), np.log10(raw_rows[raw_column]), size=20)
        p.circle(np.log10(table_rows['mass (g)']), np.log10(table_rows[table_column]), size=10, color='black')
        bokeh.io.show(p)
We do the same for the species averaged values.
In [8]:
# Species-level averages per spiracle for both tables.
# Only the grouping keys and the plotted numeric columns are kept before
# aggregating: averaging every column would also hit text columns, which older
# pandas silently discarded but pandas >= 2.0 rejects with a TypeError.
# dropna() runs on the full raw table first, so the set of rows that
# contribute is the same as when all columns were aggregated.
group_keys = ['species', 'spiracle']
df_av1 = (
    df_raw_tidy.dropna()[group_keys + ['mass', 'area mm^2', 'depth']]
    .groupby(group_keys, as_index=False)
    .aggregate(np.average)
)
df_av2 = (
    df[group_keys + ['mass (g)', 'area (mm^2)', 'depth (mm)']]
    .groupby(group_keys, as_index=False)
    .aggregate(np.average)
)

for s in df['spiracle'].unique():
    # Averaged rows of each table belonging to this spiracle.
    raw_rows = df_av1.loc[df_av1['spiracle'] == s]
    table_rows = df_av2.loc[df_av2['spiracle'] == s]

    for title_suffix, raw_column, table_column in [
        (' log(area) vs log(mass)', 'area mm^2', 'area (mm^2)'),
        (' log(depth) vs log(mass)', 'depth', 'depth (mm)'),
    ]:
        p = bokeh.plotting.figure(plot_height=350, title=s + title_suffix)
        # Large circles: averages computed here; small black circles: comparison table.
        p.circle(np.log10(raw_rows['mass']), np.log10(raw_rows[raw_column]), size=20)
        p.circle(np.log10(table_rows['mass (g)']), np.log10(table_rows[table_column]), size=10, color='black')
        bokeh.io.show(p)
In [ ]: