This notebook records progress in the Marcelle HCS image clustering project.
As always, we start by setting matplotlib inline and importing pyplot.
%matplotlib inline
from matplotlib import pyplot as plt
Next, we set the working directory...
import os
working_dir = '~/Data/marcelle-hcs-sample/features'
os.chdir(os.path.expanduser(working_dir))
The working directory contains a pre-populated file called dirs.txt,
which lists all the directories on the merri cluster in which a feature map was computed (along with the RGB images). We use this list to copy over each feature matrix.
import paramiko as mk
with open('dirs.txt', 'r') as fin:
    line = fin.readline().rstrip()
    dirs = line.split(' ')
from husc.screens import myofusion as myo
reload(myo)  # dev convenience; use importlib.reload on Python 3
remote_base_dir = 'marcelle/raw-data'
plateids = [myo.dir2plate(d) for d in dirs]  # one plate id per directory
feature_fns = ['features-%i.h5' % p for p in plateids]
# set up SFTP transport to fetch image files
trans = mk.Transport(('merri.vlsci.unimelb.edu.au', 22))
trans.connect(pkey=mk.Agent().get_keys()[0], username='jni')
sftp = mk.SFTPClient.from_transport(trans)
# for d, f in zip(dirs, feature_fns):
#     if not os.path.exists(f):
#         sftp.get(os.path.join(remote_base_dir, d, 'features.h5'), f)
The fetch only needs to happen once, so we keep the code that was run here for reference:
import time
import subprocess as sp
for d, f in zip(dirs, feature_fns):
    if not os.path.exists(f):  # allow resumption of interrupted scps
        time.sleep(1)  # merri appears to choke on too many scps in a row
        sp.check_call(['scp', 'jni@merri.vlsci.unimelb.edu.au:' +
                       os.path.join(remote_base_dir, d, 'features.h5'), f])
Let's do a sanity check to ensure plate ids are unique:
print(len(dirs), len(set(dirs)),
      len(plateids), len(set(plateids)))
tempdict = {}
for k in plateids:
    tempdict.setdefault(k, []).append(k)
print([k for k, v in tempdict.items() if len(v) > 1])
Next, we read in all the feature maps using pandas.
import os
import pandas as pd
import numpy as np
# match the features-<plateid>.h5 files fetched above
h5_fns = sorted(f for f in os.listdir('.')
                if f.startswith('features-') and f.endswith('.h5')
                and not f.endswith('NOCODE.h5'))
datasets = [pd.read_hdf(f, 'data') for f in h5_fns]
data = pd.concat(datasets)
print(data)
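StandardScaler (used below) does not accept missing values, so it is worth a quick, optional check that the concatenated matrix is complete:
print(data.shape)
print('any missing values:', data.isnull().values.any())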
Now, let's use scikit-learn to do some PCA.
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
X = StandardScaler().fit_transform(data)
pca = decomposition.PCA(n_components=2)
X_reduced = pca.fit_transform(X)
print(X_reduced.shape)
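It is also worth checking how much of the variance these two components actually capture; the fitted PCA object exposes this as explained_variance_ratio_:
print(pca.explained_variance_ratio_)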
The obligatory PCA scatter:
plt.scatter(*(X_reduced.T))
What features contributed most to the top two PCA components? We sort by absolute loading, since large negative loadings contribute as much as large positive ones:
pca_comp1, pca_comp2 = (np.argsort(np.abs(pca.components_[0]))[-1:-4:-1],
                        np.argsort(np.abs(pca.components_[1]))[-1:-4:-1])
data.columns[pca_comp1], data.columns[pca_comp2]
Let's cluster the data (using KMeans):
from sklearn import cluster
# aim for an average cluster size of 20 samples
avg_cluster_size = 20
K = cluster.MiniBatchKMeans(n_clusters=(X.shape[0] // avg_cluster_size))
K = K.fit(X)
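As a rough check on cluster quality, we can compute a silhouette score on a subsample (the full computation is quadratic in the number of samples); values near 0 suggest heavily overlapping clusters:
from sklearn.metrics import silhouette_score
print(silhouette_score(X, K.labels_, sample_size=1000, random_state=0))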
Now, we can find out which images clustered together and which interfering RNAs they correspond to.
from husc import preprocess as pre
reload(pre)  # dev convenience; use importlib.reload on Python 3
plate2dir = myo.make_plate2dir_dict(dirs)
# parse the plate and well ids out of each image filename
sems = [myo.myores_semantic_filename(fn) for fn in data.index]
data.index = [(s['plate'], s['well']) for s in sems]
plates = set(ix[0] for ix in data.index)
print(plates)
The gene data has been pre-loaded into a MongoDB database, which maps all plates and wells to genes and images. (See husc.screens.myofusion.populate_db.)
from pymongo import MongoClient
collection = MongoClient()["myofusion"]["wells"]
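To see what a well document looks like (the gene_name and filename fields are used below), we can pull out an arbitrary entry:
print(collection.find_one())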
def get_cluster_gene_data(cluster_id):
    """Return index keys and MongoDB documents for one cluster."""
    cluster_members = [data.index[i]
                       for i in np.flatnonzero(K.labels_ == cluster_id)]
    cluster_entries = [collection.find_one({"_id": myo.key2mongo(c)})
                       for c in cluster_members]
    return cluster_members, cluster_entries
print(len(K.labels_), len(np.unique(K.labels_)), np.max(K.labels_))
plt.plot(np.bincount(K.labels_))
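A compact numeric summary of the same cluster size distribution:
sizes = np.bincount(K.labels_)
print(sizes.min(), np.median(sizes), sizes.max())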
cluster_members_1, cluster_entries_1 = get_cluster_gene_data(17)
print(len(cluster_members_1))
cluster_entries_1[:3]
We can use the plate and well numbers to grab the images that were analysed.
The database records the location of each file on merri, and we use our previously established paramiko connection (held by the sftp object) to fetch files on request.
import tempfile as tmp
import mahotas as mh

def get_doc_value(mongo_doc, key):
    """Return (value, failed), with failed=True when key is missing."""
    try:
        return mongo_doc[key], False
    except KeyError:
        return "", True

def get_images(key, key_type='_id',
               ids_returned=["_id", "gene_name"],
               failsafe_ids=["label"]):
    """Fetch all images matching key and label them with database fields."""
    if key_type == "_id" and isinstance(key, tuple):
        key = myo.key2mongo(key)
    im_entries = collection.find({key_type: key})
    ims = []
    ids = []
    for entry in im_entries:
        im_path = entry["filename"]
        # fetch the remote image into a local temporary file
        fd, tmpfile = tmp.mkstemp(suffix='.' + im_path.split('.')[-1],
                                  dir='.')
        os.close(fd)  # mkstemp returns an open file descriptor; close it
        sftp.get(im_path, tmpfile)
        ims.append(mh.imread(tmpfile))
        id_list = []
        fail = False
        for ix in ids_returned:
            val, fail = get_doc_value(entry, ix)
            id_list.append(val)
        if fail:  # last requested field missing; fall back to failsafe ids
            for ix in failsafe_ids:
                id_list.append(get_doc_value(entry, ix)[0])
        ids.append(" ".join(map(str, id_list)))
    # materialise as a list so it can be len()-ed and reused
    labeled_ims = list(zip(ids, ims))
    return labeled_ims
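Note that get_images leaves its temporary image files in the working directory, where they accumulate. A minimal cleanup sketch, assuming the default 'tmp' prefix that mkstemp uses (check the matches before deleting anything):
import glob
for fn in glob.glob('tmp*'):  # default mkstemp prefix; verify before removing
    os.remove(fn)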
Now we want to look at the images. The function below shows images three to a row, in a grid with as many rows as needed.
def show_3ims(label_ims):
    """Show labeled images in a grid, three per row."""
    num_rows = int(np.ceil(len(label_ims) / 3.0))
    fig, axes = plt.subplots(num_rows, 3, figsize=(15, 5 * num_rows))
    for (name, im), ax in zip(label_ims, axes.ravel()):
        ax.imshow(im)
        ax.set_title(name)
cluster_members_1
try:
    ims = [get_images(k)[0] for k in cluster_members_1]
except Exception:
    # print the offending filenames to help debugging, then re-raise
    print([c["filename"] for c in cluster_entries_1])
    raise
show_3ims(ims)