%matplotlib notebook
from scipy.io import loadmat
import numpy as np
import pandas as pd
from copy import copy
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import itertools
from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.linear_model import LinearRegression


dat = loadmat('../data/corr/wbdata/TS20141221b_THK178_lite-1_punc-31_NLS3_6eggs_1mMTet_basal_1080s.mat')


# load and clean data
dfof = dat['wbData']['deltaFOverF'][0][0]

# ids are in an atrocious state, just want a damn list sometimes yano
ids = dat['wbData']['NeuronIds'][0][0][0]
id_list = []
for id in ids:
    sub_id = id[0].tolist()
    try:
        subsub_id = sub_id[0].tolist()
        id_list.append(subsub_id)
    except IndexError:
        id_list.append([""])
        
ids = np.array([subitem for sublist in id_list for subitem in sublist])


id_idx = np.where(ids)[0]
ids = ids[id_idx]
dfof = dfof[:,id_idx]

# rearrange in alphabetical order so our correlation matrices match up
dfof = dfof[:,np.argsort(ids)]


# tada!
dfof_corr = np.corrcoef(dfof.T)


# load the connectome
connect = pd.read_excel('../data/corr/CElegansNeuronTables.xls')

# subset only the cells we have in the Ca data
good_cells = np.logical_and(connect['Origin'].isin(ids), connect['Target'].isin(ids))
connect = connect.loc[good_cells,:]

# a weight column
connect['w'] = np.zeros(connect.shape[0])


# make sure we have all the same cells 
pair_list = list(itertools.permutations(sorted(ids),2))
for pair in pair_list:
    # if the pair isn't in the dframe...
    if connect[np.logical_and(connect['Origin'] == pair[0], connect['Target'] == pair[1])].shape[0] == 0:
        # add an 'unconnected row'
        connect = connect.append({'Origin': pair[0],
                                  'Target': pair[1],
                                  'Type': 'Unconnected',
                                  'Number of Connections':0,
                                  'Neurotransmitter': "None",
                                  'w':0}, ignore_index=True)
        
# sort this too
connect.sort_values(by=['Origin', 'Target'], axis=0, inplace=True)


naive = copy(connect)

# Set all the points that aren't "unconnected" to 1
naive.loc[naive['Type'] != "Unconnected",'w'] = 1.0

# multiply by the number of connections to get a very rough estimate of connection strength.
naive.loc[:,'w'] = naive.loc[:,'w']*naive.loc[:,'Number of Connections']


# collapse duplicates - some synapses are listed twice because they have both gap junctions and chemical synapses.
naive_sum = naive.groupby(["Origin", "Target"])['w'].sum()
naive_sum = naive_sum.reset_index()

# rescale to 0-1
naive_sum.loc[:,'w'] = naive_sum.w/(naive_sum.w.max())

# pivot to an adjacency matrix
# pivoting automatically sorts by 
naive_adj = naive_sum.pivot(index="Origin", columns="Target", values='w')
naive_adj.fillna(value=0, inplace=True)


fig, ax = plt.subplots(1,2)
ax[0].imshow(np.abs(dfof_corr))
ax[1].imshow(np.sqrt(naive_adj.values))

ax[0].set_title('Calcium signal correlation')
ax[1].set_title('Connectivity Matrix')

Text(0.5,1,'Connectivity Matrix')


fig, ax = plt.subplots()

# use the square root because there are very few connections with the highest strength and most are zero...
# hey gotta transform somehow.
sqrt_connectivity = np.sqrt(naive_adj.values)

# do a lil linear regression
model = LinearRegression()
model.fit(sqrt_connectivity.reshape(-1,1), np.abs(dfof_corr).reshape(-1,1))

# make predicted points
x_new = np.linspace(0, 1, 100)
y_new = model.predict(x_new[:, np.newaxis])



# also log scale the fill because most points are unconnected.
ax.hexbin(sqrt_connectivity,np.abs(dfof_corr), 
          gridsize=(10,10),
          norm=colors.LogNorm())
ax.scatter(sqrt_connectivity,np.abs(dfof_corr))
ax.plot(x_new, y_new, color="r")

/usr/local/lib/python3.7/site-packages/sklearn/linear_model/base.py:509: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  linalg.lstsq(X, y)

[<matplotlib.lines.Line2D at 0x12011db38>]


model.score(sqrt_connectivity.reshape(-1,1), np.abs(dfof_corr).reshape(-1,1))

0.020443230361266274


connect_bool = np.ndarray((np.ravel(naive_adj.values).shape),
                         dtype=np.bool)
connect_bool[np.where(np.ravel(naive_adj.values))] = True


fpr, tpr, thresholds = roc_curve(connect_bool,
                                 np.ravel(np.abs(dfof_corr)))


fig, ax = plt.subplots()

ax.plot([0,1], [0,1], 'k--')
ax.plot(fpr, tpr)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')

Text(0,0.5,'True positive rate')


roc_auc_score(connect_bool, np.ravel(np.abs(dfof_corr)))

0.5407500817336883

Parse the heinous .mat Ca data¶

Load that connectome!¶

Plot the results...¶