This notebook records progress in the Marcelle HCS image clustering project.
As always, we start by setting matplotlib inline and importing pyplot.
%matplotlib inline
from matplotlib import pyplot as plt
Next, we set the working directory...
import os
working_dir = '~/Data/marcelle-hcs-sample/features'
os.chdir(os.path.expanduser(working_dir))
The working directory contains a pre-populated file called dirs.txt,
which lists all the directories on the merri cluster in which a feature map was computed (along with the RGB images). We use this list to copy over each feature matrix.
import paramiko as mk
with open('dirs.txt', 'r') as fin:
    line = fin.readline().rstrip()
    dirs = line.split(' ')
from husc.screens import myofusion as myo
reload(myo)  # dev convenience; use importlib.reload on Python 3
remote_base_dir = 'marcelle/raw-data'
plateids = [myo.dir2plate(d) for d in dirs]  # one plate id per directory
feature_fns = ['features-%i.h5' % p for p in plateids]
# set up SFTP transport to fetch image files
trans = mk.Transport(('merri.vlsci.unimelb.edu.au', 22))
trans.connect(pkey=mk.Agent().get_keys()[0], username='jni')
sftp = mk.SFTPClient.from_transport(trans)
# for d, f in zip(dirs, feature_fns):
#     if not os.path.exists(f):
#         sftp.get(os.path.join(remote_base_dir, d, 'features.h5'), f)
The fetch only needs to happen once, so we keep the code that was run here for reference:
import time
import subprocess as sp
for d, f in zip(dirs, feature_fns):
    if not os.path.exists(f):  # allow resumption of interrupted scps
        time.sleep(1)  # merri appears to choke on too many scps in a row
        sp.check_call(['scp', 'jni@merri.vlsci.unimelb.edu.au:' +
                       os.path.join(remote_base_dir, d, 'features.h5'), f])
Let's do a sanity check to ensure plate ids are unique:
print(len(dirs), len(set(dirs)),
      len(plateids), len(set(plateids)))
tempdict = {}
for k in plateids:
    tempdict.setdefault(k, []).append(k)
print([k for k, v in tempdict.items() if len(v) > 1])
Next, we read in all the feature maps using pandas.
import os
import pandas as pd
import numpy as np
# match the features-<plateid>.h5 files fetched above
h5_fns = sorted(f for f in os.listdir('.')
                if f.startswith('features-') and f.endswith('.h5')
                and not f.endswith('NOCODE.h5'))
datasets = [pd.read_hdf(f, 'data') for f in h5_fns]
data = pd.concat(datasets)
print(data)
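StandardScaler (used below) does not accept missing values, so it is worth a quick, optional check that the concatenated matrix is complete:
print(data.shape)
print('any missing values:', data.isnull().values.any())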
Now, let's use scikit-learn to do some PCA.
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
X = StandardScaler().fit_transform(data)
pca = decomposition.PCA(n_components=2)
X_reduced = pca.fit_transform(X)
print(X_reduced.shape)
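It is also worth checking how much of the variance these two components actually capture; the fitted PCA object exposes this as explained_variance_ratio_:
print(pca.explained_variance_ratio_)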
The obligatory PCA scatter:
plt.scatter(*(X_reduced.T))
What features contributed most to the top two PCA components? We sort by absolute loading, since large negative loadings contribute as much as large positive ones:
pca_comp1, pca_comp2 = (np.argsort(np.abs(pca.components_[0]))[-1:-4:-1],
                        np.argsort(np.abs(pca.components_[1]))[-1:-4:-1])
data.columns[pca_comp1], data.columns[pca_comp2]
Let's cluster the data (using KMeans):
from sklearn import cluster
# aim for an average cluster size of 20 samples
avg_cluster_size = 20
K = cluster.MiniBatchKMeans(n_clusters=(X.shape[0] // avg_cluster_size))
K = K.fit(X)
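As a rough check on cluster quality, we can compute a silhouette score on a subsample (the full computation is quadratic in the number of samples); values near 0 suggest heavily overlapping clusters:
from sklearn.metrics import silhouette_score
print(silhouette_score(X, K.labels_, sample_size=1000, random_state=0))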
Now, we can find out which images clustered together and which interfering RNAs they correspond to.
from husc import preprocess as pre
reload(pre)  # dev convenience; use importlib.reload on Python 3
plate2dir = myo.make_plate2dir_dict(dirs)
# parse the plate and well ids out of each image filename
sems = [myo.myores_semantic_filename(fn) for fn in data.index]
data.index = [(s['plate'], s['well']) for s in sems]
plates = set(ix[0] for ix in data.index)
print(plates)
The gene data has been pre-loaded into a MongoDB database, which maps all plates and wells to genes and images. (See husc.screens.myofusion.populate_db.)
from pymongo import MongoClient
collection = MongoClient()["myofusion"]["wells"]
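To see what a well document looks like (the gene_name and filename fields are used below), we can pull out an arbitrary entry:
print(collection.find_one())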
def get_cluster_gene_data(cluster_id):
    """Return index keys and MongoDB documents for one cluster."""
    cluster_members = [data.index[i]
                       for i in np.flatnonzero(K.labels_ == cluster_id)]
    cluster_entries = [collection.find_one({"_id": myo.key2mongo(c)})
                       for c in cluster_members]
    return cluster_members, cluster_entries
print(len(K.labels_), len(np.unique(K.labels_)), np.max(K.labels_))
plt.plot(np.bincount(K.labels_))
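A compact numeric summary of the same cluster size distribution:
sizes = np.bincount(K.labels_)
print(sizes.min(), np.median(sizes), sizes.max())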
cluster_members_1, cluster_entries_1 = get_cluster_gene_data(17)
print(len(cluster_members_1))
cluster_entries_1[:3]
We can use the plate and well numbers to grab the images that were analysed.
The database records the location of each file on merri, and we use our previously established paramiko connection (held by the sftp object) to fetch files on request.
import tempfile as tmp
import mahotas as mh

def get_doc_value(mongo_doc, key):
    """Return (value, failed), with failed=True when key is missing."""
    try:
        return mongo_doc[key], False
    except KeyError:
        return "", True

def get_images(key, key_type='_id',
               ids_returned=["_id", "gene_name"],
               failsafe_ids=["label"]):
    """Fetch all images matching key and label them with database fields."""
    if key_type == "_id" and isinstance(key, tuple):
        key = myo.key2mongo(key)
    im_entries = collection.find({key_type: key})
    ims = []
    ids = []
    for entry in im_entries:
        im_path = entry["filename"]
        # fetch the remote image into a local temporary file
        fd, tmpfile = tmp.mkstemp(suffix='.' + im_path.split('.')[-1],
                                  dir='.')
        os.close(fd)  # mkstemp returns an open file descriptor; close it
        sftp.get(im_path, tmpfile)
        ims.append(mh.imread(tmpfile))
        id_list = []
        fail = False
        for ix in ids_returned:
            val, fail = get_doc_value(entry, ix)
            id_list.append(val)
        if fail:  # last requested field missing; fall back to failsafe ids
            for ix in failsafe_ids:
                id_list.append(get_doc_value(entry, ix)[0])
        ids.append(" ".join(map(str, id_list)))
    # materialise as a list so it can be len()-ed and reused
    labeled_ims = list(zip(ids, ims))
    return labeled_ims
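Note that get_images leaves its temporary image files in the working directory, where they accumulate. A minimal cleanup sketch, assuming the default 'tmp' prefix that mkstemp uses (check the matches before deleting anything):
import glob
for fn in glob.glob('tmp*'):  # default mkstemp prefix; verify before removing
    os.remove(fn)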
Now we want to look at the images. The function below shows images three to a row, in a grid with as many rows as needed.
def show_3ims(label_ims):
    """Show labeled images in a grid, three per row."""
    num_rows = int(np.ceil(len(label_ims) / 3.0))
    fig, axes = plt.subplots(num_rows, 3, figsize=(15, 5 * num_rows))
    for (name, im), ax in zip(label_ims, axes.ravel()):
        ax.imshow(im)
        ax.set_title(name)
cluster_members_1
try:
    ims = [get_images(k)[0] for k in cluster_members_1]
except Exception:
    # print the offending filenames to help debugging, then re-raise
    print([c["filename"] for c in cluster_entries_1])
    raise
show_3ims(ims)