How to use indigopy
Example code demonstrating how to use the indigopy package.
Set up environment
[1]:
# Import dependencies
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import r2_score, classification_report
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import plotly.graph_objects as go
# Import package functions
import sys
sys.path.append('c:/Users/User/github/INDIGOpy/') # modify if testing locally on a different machine; remove once the package is published
from indigopy.core import load_sample, featurize, classify
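If the package is already published, the sys.path workaround above is unnecessary; a minimal sketch of the alternative, assuming the distribution is available on PyPI under the name indigopy:
# Assumes indigopy has been published to PyPI; install once with:
#     pip install indigopy
# then import the core functions directly
from indigopy.core import load_sample, featurize, classify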
Example: E. coli
[2]:
# Load sample data
sample = load_sample('ecoli')
# Define input arguments
key = sample['key']
profiles = sample['profiles']
feature_names = sample['feature_names']
train_ixns = sample['train']['interactions']
train_scores = sample['train']['scores']
test_ixns = sample['test']['interactions']
test_scores = sample['test']['scores']
# Determine ML features
train_data = featurize(train_ixns, profiles, feature_names=feature_names, key=key, silent=True)
test_data = featurize(test_ixns, profiles, feature_names=feature_names, key=key, silent=True)
X_train, X_test = train_data['feature_df'].to_numpy().transpose(), test_data['feature_df'].to_numpy().transpose()
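As a quick sanity check, the transposed arrays have one row per drug interaction (sample) and one column per feature, which is the orientation scikit-learn expects; a minimal sketch:
# Rows = drug interactions, columns = chemogenomic features
print(X_train.shape, X_test.shape)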
[5]:
print(test_data['feature_df'])
                        AMK + FUS  AMK + RIF  AMK + SPE  AMK + VAN  CEF + FUS  \
sigma-neg-ECK1963-HCHA        0.0        0.0        0.0        1.0        0.0
sigma-neg-ECK1488-PQQL        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK0813-YBIW        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK2456-EUTP        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK3716-BGLG        0.0        0.0        0.0        0.0        0.0
...                           ...        ...        ...        ...        ...
delta-pos-ECK2585-KGTP        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK1179-CVRA        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3002-YQHC        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3103-TDCE        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3742-RBSD        0.0        0.0        0.0        0.0        0.0

                        CEF + RIF  CEF + SPE  CEF + VAN  CHL + FUS  CHL + RIF  \
sigma-neg-ECK1963-HCHA        0.0        0.0        1.0        0.0        0.0
sigma-neg-ECK1488-PQQL        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK0813-YBIW        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK2456-EUTP        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK3716-BGLG        0.0        0.0        0.0        0.0        0.0
...                           ...        ...        ...        ...        ...
delta-pos-ECK2585-KGTP        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK1179-CVRA        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3002-YQHC        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3103-TDCE        0.0        0.0        0.0        1.0        1.0
delta-pos-ECK3742-RBSD        0.0        0.0        0.0        1.0        1.0

                        ...  RIF + TOB  RIF + TMP  RIF + VAN  SPE + TET  \
sigma-neg-ECK1963-HCHA  ...        0.0        0.0        1.0        0.0
sigma-neg-ECK1488-PQQL  ...        0.0        0.0        0.0        0.0
sigma-neg-ECK0813-YBIW  ...        0.0        1.0        0.0        0.0
sigma-neg-ECK2456-EUTP  ...        0.0        0.0        0.0        0.0
sigma-neg-ECK3716-BGLG  ...        0.0        0.0        0.0        0.0
...                     ...        ...        ...        ...        ...
delta-pos-ECK2585-KGTP  ...        0.0        0.0        0.0        0.0
delta-pos-ECK1179-CVRA  ...        0.0        0.0        0.0        0.0
delta-pos-ECK3002-YQHC  ...        0.0        0.0        0.0        1.0
delta-pos-ECK3103-TDCE  ...        0.0        0.0        0.0        0.0
delta-pos-ECK3742-RBSD  ...        0.0        0.0        0.0        0.0

                        SPE + TOB  SPE + TMP  SPE + VAN  TET + VAN  TOB + VAN  \
sigma-neg-ECK1963-HCHA        0.0        0.0        1.0        1.0        1.0
sigma-neg-ECK1488-PQQL        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK0813-YBIW        0.0        1.0        0.0        0.0        0.0
sigma-neg-ECK2456-EUTP        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK3716-BGLG        0.0        0.0        0.0        0.0        0.0
...                           ...        ...        ...        ...        ...
delta-pos-ECK2585-KGTP        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK1179-CVRA        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3002-YQHC        0.0        0.0        0.0        1.0        0.0
delta-pos-ECK3103-TDCE        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3742-RBSD        0.0        0.0        0.0        0.0        0.0

                        TMP + VAN
sigma-neg-ECK1963-HCHA        1.0
sigma-neg-ECK1488-PQQL        0.0
sigma-neg-ECK0813-YBIW        1.0
sigma-neg-ECK2456-EUTP        0.0
sigma-neg-ECK3716-BGLG        0.0
...                           ...
delta-pos-ECK2585-KGTP        0.0
delta-pos-ECK1179-CVRA        0.0
delta-pos-ECK3002-YQHC        0.0
delta-pos-ECK3103-TDCE        0.0
delta-pos-ECK3742-RBSD        0.0

[15916 rows x 66 columns]
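The next cell converts the continuous interaction scores into class labels. A minimal illustration of the assumed convention (scores below the lower threshold map to the first class, 'S' for synergy; scores above the upper threshold map to the last class, 'A' for antagonism; everything in between maps to the middle class, 'N' for neutral); the scores below are made up:
# Illustrative only -- hypothetical scores spanning the three regimes
example_scores = [-1.2, 0.7, 3.4]
print(classify(example_scores, thresholds=(-0.5, 2), classes=('S', 'N', 'A')))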
[ ]:
# Determine class labels
thresh, classes = (-0.5, 2), ('S', 'N', 'A')
train_labels = classify(train_scores, thresholds=thresh, classes=classes)
test_labels = classify(test_scores, thresholds=thresh, classes=classes)
# Train and apply a regression-based model
reg_model = RandomForestRegressor()
reg_model.fit(X_train, train_scores)
reg_y = reg_model.predict(X_test)
r, p = spearmanr(test_scores, reg_y)
r2 = r2_score(test_scores, reg_y)
print('Regression results:')
print('\tSpearman R = {}'.format(round(r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
print('\tR2 = {}'.format(round(r2, 4)))
# Train and apply a classification-based model
class_model = RandomForestClassifier()
class_model.fit(X_train, train_labels)
class_y = class_model.predict(X_test)
print('Classification results:')
print(classification_report(test_labels, class_y))
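The trained models can also score combinations that are not part of the sample data; a minimal sketch, assuming interactions are supplied as drug-name pairs and that the names below appear in the sample key:
# Hypothetical new combination (drug names assumed to be present in the key)
new_ixns = [('AMK', 'CEF')]
new_data = featurize(new_ixns, profiles, feature_names=feature_names, key=key, silent=True)
X_new = new_data['feature_df'].to_numpy().transpose()
print(reg_model.predict(X_new))    # predicted interaction score
print(class_model.predict(X_new))  # predicted class label ('S', 'N', or 'A')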
Example: M. tuberculosis
[3]:
# Load sample data
sample = load_sample('mtb')
# Define input arguments
key = sample['key']
profiles = sample['profiles']
feature_names = sample['feature_names']
train_ixns = sample['train']['interactions']
train_scores = sample['train']['scores']
test_ixns = sample['test']['interactions']
test_scores = sample['test']['scores']
clinical_ixns = sample['clinical']['interactions']
clinical_scores = sample['clinical']['scores']
# Determine ML features
train_data = featurize(train_ixns, profiles, feature_names=feature_names, key=key, silent=True)
test_data = featurize(test_ixns, profiles, feature_names=feature_names, key=key, silent=True)
clinical_data = featurize(clinical_ixns, profiles, feature_names=feature_names, key=key, silent=True)
X_train, X_test = train_data['feature_df'].to_numpy().transpose(), test_data['feature_df'].to_numpy().transpose()
X_clinical = clinical_data['feature_df'].to_numpy().transpose()
# Determine class labels
thresh, classes = (0.9, 1.1), ('S', 'N', 'A')
train_labels = classify(train_scores, thresholds=thresh, classes=classes)
test_labels = classify(test_scores, thresholds=thresh, classes=classes)
# Train and apply a regression-based model
reg_model = RandomForestRegressor()
reg_model.fit(X_train, train_scores)
reg_y = reg_model.predict(X_test)
r, p = spearmanr(test_scores, reg_y)
r2 = r2_score(test_scores, reg_y)
print('Regression results:')
print('\tSpearman R = {}'.format(round(r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
print('\tR2 = {}'.format(round(r2, 4)))
# Train and apply a classification-based model
class_model = RandomForestClassifier()
class_model.fit(X_train, train_labels)
class_y = class_model.predict(X_test)
print('Classification results:')
print(classification_report(test_labels, class_y))
# Apply model to clinical data
clinical_y = reg_model.predict(X_clinical)
r, p = spearmanr(clinical_scores, clinical_y)
print('Clinical results:')
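# note: the Spearman correlation is reported with its sign flipped below, presumably because the clinical outcome scores are oriented opposite to the predicted interaction scores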
print('\tSpearman R = {}'.format(round(-r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
Regression results:
Spearman R = 0.5883
Spearman p = 0.000161
R2 = 0.1507
Classification results:
              precision    recall  f1-score   support

           A       0.67      0.38      0.48        16
           N       0.00      0.00      0.00         1
           S       0.73      0.84      0.78        19

    accuracy                           0.61        36
   macro avg       0.46      0.41      0.42        36
weighted avg       0.68      0.61      0.63        36
Clinical results:
Spearman R = 0.5607
Spearman p = 5.74e-06
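Because the models are random forests, the trained regressor also exposes feature importances, which can be mapped back to the feature names returned by featurize; a minimal sketch, assuming (as in the E. coli printout above) that the rows of feature_df are the feature names:
# Rank features by their importance in the trained M. tuberculosis regressor
importances = pd.Series(reg_model.feature_importances_, index=train_data['feature_df'].index)
print(importances.sort_values(ascending=False).head(10))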
Example: S. aureus
[4]:
# Load sample data
sample = load_sample('saureus')
# Define input arguments
key = sample['key']
profiles = sample['profiles']
feature_names = sample['feature_names']
train_ixns = sample['train']['interactions']
train_scores = sample['train']['scores']
test_ixns = sample['test']['interactions']
test_scores = sample['test']['scores']
strains = sample['orthology']['strains']
orthology_map = sample['orthology']['map']
# Determine ML features
train_data = featurize(train_ixns, profiles, feature_names=feature_names, key=key, silent=True)
test_data = featurize(test_ixns, profiles, feature_names=feature_names, key=key, silent=True)
X_train, X_test = train_data['feature_df'].to_numpy().transpose(), test_data['feature_df'].to_numpy().transpose()
# Determine class labels
thresh, classes = (-0.5, 2), ('S', 'N', 'A')
train_labels = classify(train_scores, thresholds=thresh, classes=classes)
test_labels = classify(test_scores, thresholds=thresh, classes=classes)
# Train and apply a regression-based model
reg_model = RandomForestRegressor()
reg_model.fit(X_train, train_scores)
reg_y = reg_model.predict(X_test)
r, p = spearmanr(test_scores, reg_y)
r2 = r2_score(test_scores, reg_y)
print('Regression results:')
print('\tSpearman R = {}'.format(round(r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
print('\tR2 = {}'.format(round(r2, 4)))
# Orthology version
test_data_ortho = featurize(test_ixns, profiles, feature_names=feature_names, key=key,
                            strains=strains, orthology_map=orthology_map, silent=True)
X_test_ortho = test_data_ortho['feature_df'].to_numpy().transpose()
reg_y_ortho = reg_model.predict(X_test_ortho)
r, p = spearmanr(test_scores, reg_y_ortho)
r2 = r2_score(test_scores, reg_y_ortho)
print('Regression results (with orthology):')
print('\tSpearman R = {}'.format(round(r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
print('\tR2 = {}'.format(round(r2, 4)))
# Train and apply a classification-based model
class_model = RandomForestClassifier()
class_model.fit(X_train, train_labels)
class_y = class_model.predict(X_test)
print('Classification results:')
print(classification_report(test_labels, class_y))
Regression results:
Spearman R = 0.478
Spearman p = 0.000898
R2 = -0.3188
Regression results (with orthology):
Spearman R = 0.5756
Spearman p = 3.52e-05
R2 = -1.2016
Classification results:
              precision    recall  f1-score   support

           A       0.33      0.50      0.40         2
           N       0.50      0.77      0.61        22
           S       0.62      0.24      0.34        21

    accuracy                           0.51        45
   macro avg       0.49      0.50      0.45        45
weighted avg       0.55      0.51      0.48        45
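The orthology-mapped features can be evaluated with the classifier as well; a short sketch reusing the objects defined in the cell above:
# Apply the trained classifier to the orthology-mapped test features
class_y_ortho = class_model.predict(X_test_ortho)
print(classification_report(test_labels, class_y_ortho))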
Example: A. baumannii
[3]:
# Load sample data
sample = load_sample('abaumannii')
# Define input arguments
key = sample['key']
profiles = sample['profiles']
feature_names = sample['feature_names']
train_ixns = sample['train']['interactions']
train_scores = sample['train']['scores']
test_ixns = sample['test']['interactions']
test_scores = sample['test']['scores']
strains = sample['orthology']['strains']
orthology_map = sample['orthology']['map']
# Determine ML features
train_data = featurize(train_ixns, profiles, feature_names=feature_names, key=key, silent=True)
test_data = featurize(test_ixns, profiles, feature_names=feature_names, key=key, silent=True)
X_train, X_test = train_data['feature_df'].to_numpy().transpose(), test_data['feature_df'].to_numpy().transpose()
# Determine class labels
thresh, classes = (-0.5, 0), ('S', 'N', 'A')
train_labels = classify(train_scores, thresholds=thresh, classes=classes)
test_labels = classify(test_scores, thresholds=thresh, classes=classes)
# Train and apply a regression-based model
reg_model = RandomForestRegressor()
reg_model.fit(X_train, train_scores)
reg_y = reg_model.predict(X_test)
r, p = spearmanr(test_scores, reg_y)
r2 = r2_score(test_scores, reg_y)
print('Regression results:')
print('\tSpearman R = {}'.format(round(r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
print('\tR2 = {}'.format(round(r2, 4)))
# Orthology version
test_data_ortho = featurize(test_ixns, profiles, feature_names=feature_names, key=key,
                            strains=strains, orthology_map=orthology_map, silent=True)
X_test_ortho = test_data_ortho['feature_df'].to_numpy().transpose()
reg_y_ortho = reg_model.predict(X_test_ortho)
r, p = spearmanr(test_scores, reg_y_ortho)
r2 = r2_score(test_scores, reg_y_ortho)
print('Regression results (with orthology):')
print('\tSpearman R = {}'.format(round(r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
print('\tR2 = {}'.format(round(r2, 4)))
# Train and apply a classification-based model
class_model = RandomForestClassifier()
class_model.fit(X_train, train_labels)
class_y = class_model.predict(X_test)
print('Classification results:')
print(classification_report(test_labels, class_y))
# Visualize results
df = pd.DataFrame({
    'x': test_labels,
    'y': reg_y_ortho
})
fig = go.Figure()
fig.add_trace(go.Box(y=df.y[df.x=='A'], name='Antagonism', marker_color='red'))
fig.add_trace(go.Box(y=df.y[df.x=='N'], name='Neutral', marker_color='gray'))
fig.add_trace(go.Box(y=df.y[df.x=='S'], name='Synergy', marker_color='blue'))
fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    title='A. baumannii',
    xaxis_title='True Class',
    yaxis_title='Predicted Score',
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)'
)
fig.show()
Regression results:
Spearman R = 0.6469
Spearman p = 1.58e-06
R2 = -0.6263
Regression results (with orthology):
Spearman R = 0.5968
Spearman p = 1.51e-05
R2 = -0.3828
Classification results:
              precision    recall  f1-score   support

           A       0.47      0.82      0.60        17
           N       0.17      0.09      0.12        11
           S       0.78      0.41      0.54        17

    accuracy                           0.49        45
   macro avg       0.47      0.44      0.42        45
weighted avg       0.51      0.49      0.46        45
[Plotly figure: box plots of predicted interaction scores grouped by true class (Antagonism, Neutral, Synergy) for A. baumannii]
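If the interactive figure does not render in a given environment (as above), it can be exported to a standalone file instead; a minimal sketch using plotly's built-in writers (the file name is a placeholder):
# Save the figure as a standalone HTML file; static image export requires the kaleido package
fig.write_html('abaumannii_predictions.html')
# fig.write_image('abaumannii_predictions.png')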