import itertools
import numpy as np
import os
import pandas as pd
import statsmodels.api as sm
import mygene
from missingpy import KNNImputer
from . import exceptions as e
[docs]class RNAAgeCalc(object):
'''Calculate RNA age.
Attributes:
- tissue: a string indicates which tissue the gene expression data
is obtained from. Users are expected to provide one of the these tissues:
adipose_tissue, adrenal_gland, blood, blood_vessel, brain, breast, colon,
esophagus, heart, liver, lung, muscle, nerve, ovary, pancreas, pituitary,
prostate, salivary_gland, skin, small_intestine, spleen, stomach, testis,
thyroid, uterus, vagina.
If the tissue argument is not provide or the provided tissue is not in
this list, then the age predictor trained on all tissues will be used to
calculate RNA age.
- exprtype: either "count" or "FPKM". Default is "FPKM". For RPKM data,
please use exprtype="FPKM".
- idtype: a string which indicates the gene id type in "exprdata". It
should be one of "symbol", "ensembl.gene", "entrezgene" or "refseq".
Default is "symbol".
- stype: a string which specifies which version of pre-trained
calculators to be used. It should be either "all" or "Caucasian". "all"
means samples from all races (American Indian/Alaska Native, Asian,
Black/African American, and Caucasian) are used to obtain the pre-trained
calculator. "Caucasian" means only the Caucasian samples are used to build
up the pre-trained calculator. Default is "all".
- signature: a string which indicate the age signature to use when
calculating RNA age. This argument is not required.
In the case that this argument is not provided, if `tissue` argument is
also provided and the tissue is in the list above, the tissue specific age
signature given by our DESeq2 analysis result on GTEx data will be used.
Otherwise, the across tissue signature "GTExAge" will be used.
In the case that this argument is provided, it should be one of the
following signatures. A detailed description of the meaning of these
signatures is given in the package tutorial.
DESeq2, Pearson, Dev, deMagalhaes, GenAge, GTExAge, Peters, all
'''
def __init__(self, tissue=None, exprtype="FPKM", idtype="symbol",
stype="all", signature=None):
# check input
self._check_exprtype(exprtype)
self._exprtype = exprtype
self._check_idtype(idtype)
self._idtype = idtype
self._check_stype(stype)
self._stype = stype
# end check input
# process tissue and signature argument
if tissue is None:
print("tissue is not provided, using the RNA age predictor "
"trained by all tissues automatically.")
tissue = "all_tissues"
if signature is None:
print("signature is not provided, using GTExAge signature "
"automatically.")
signature = "GTExAge"
else:
assert signature in self._sig_set(), \
"signature should be one of DESeq2, Pearson, Dev, " \
"deMagalhaes, GenAge, GTExAge, Peters, all."
if signature == "DESeq":
signature = "DESeq2"
if signature == "DESeq2":
print("DESeq2 signature is currently not available for all "
"tissues, using Pearson signature automatically.")
signature = "Pearson"
else:
assert isinstance(tissue, str), "tissue should be a string."
if tissue.lower() in self._tissue_set():
tissue = tissue.lower()
if signature is None:
print("signature is not provided, using DESeq2 signature "
"automatically.")
signature = "DESeq2"
else:
assert signature in self._sig_set(), \
"signature should be one of DESeq2, Pearson, Dev, " \
"deMagalhaes, GenAge, GTExAge, Peters, all."
if signature == "DESeq":
signature = "DESeq2"
else:
print("the provided tissue was not found, using the RNA age "
"predictor trained by all tissues automatically.")
tissue = "all_tissues"
if signature is None:
print("signature is not provided, using GTExAge "
"signature automatically.")
signature = "GTExAge"
else:
assert signature in self._sig_set(), \
"signature should be one of DESeq2, Pearson, Dev, " \
"deMagalhaes, GenAge, GTExAge, Peters, all."
if signature == "DESeq":
signature = "DESeq2"
if signature == "DESeq2":
print("DESeq2 signature is currently not available for "
"all tissues, using Pearson signature "
"automatically.")
signature = "Pearson"
self._tissue = tissue
self._signature = signature
# end: process tissue and signature argument
@property
def exprtype(self):
return self._exprtype
@exprtype.setter
def exprtype(self, exprtype):
self._check_exprtype(exprtype)
self._exprtype = exprtype
@staticmethod
def _check_exprtype(exprtype):
assert isinstance(exprtype, str), "exprtype should be a string."
assert exprtype in ["count", "FPKM"], "exprtype should be count " \
"or FPKM."
@property
def idtype(self):
return self._idtype
@idtype.setter
def idtype(self, idtype):
self._check_idtype(idtype)
self._idtype = idtype
@staticmethod
def _check_idtype(idtype):
assert isinstance(idtype, str), "idtype should be a string."
assert idtype in ["symbol", "ensembl.gene", "entrezgene", "refseq"], \
"idtype should be one of symbol, ensembl.gene, entrezgene" \
" or refseq."
@property
def stype(self):
return self._stype
@stype.setter
def stype(self, stype):
self._check_stype(stype)
self._stype = stype
@staticmethod
def _check_stype(stype):
assert isinstance(stype, str), "stype should be a string."
stype = stype.lower()
assert stype in ["all", "caucasian"], \
"stype should be either all or Caucasian."
@property
def tissue(self):
return self._tissue
@tissue.setter
def tissue(self, tissue):
raise e.SetterError("Cannot set tissue, please construct a new "
"RNAAgeCalc object and use the tissue argument"
" to set tissue. ")
@property
def signature(self):
return self._signature
@signature.setter
def signature(self, signature):
raise e.SetterError("Cannot set signature, please construct a new "
"RNAAgeCalc object and use the signature argument"
" to set signature. ")
def _processlength(self, rawcount, genelength):
if genelength is None:
location = os.path.dirname(os.path.realpath(__file__))
lengthDB = pd.read_csv(os.path.join(location, "internal_data",
"lengthDB.csv"), index_col=0)
if self.idtype != "ensembl.gene":
# convert the gene id to ensembl to match the gene id in gene
# length database
mg = mygene.MyGeneInfo()
genes = list(rawcount.index)
print("converting the gene id to ensembl to match the gene id "
"in gene length database. This may take some time.")
temp = mg.querymany(genes, scopes='symbol',
fields='ensembl.gene',
species='human', returnall=True,
as_dataframe=True)["out"]
temp1 = temp.loc[~temp["ensembl.gene"].isna(), "ensembl.gene"]
temp2 = temp.loc[~temp["ensembl"].isna(), "ensembl"]
index_expand = []
value_expand = []
for index, value in temp2.iteritems():
index_expand.append([index]*len(value))
value_expand.append(list(map(lambda x: x["gene"], value)))
temp3 = pd.Series(
list(itertools.chain.from_iterable(value_expand)),
index=list(itertools.chain.from_iterable(index_expand)))
temp1 = temp1.append(temp3)
temp1 = temp1[temp1.isin(lengthDB.index)]
temp1 = temp1[~temp1.index.duplicated(keep="first")]
temp1 = temp1.drop_duplicates(keep=False)
genename = temp1[rawcount.index]
genename[genename.isnull()] = "unknown"
genelength = lengthDB.loc[genename]
else:
genelength = lengthDB.loc[rawcount.index]
else:
bool1 = isinstance(genelength, pd.DataFrame) or \
isinstance(genelength, pd.Series) or \
isinstance(genelength, list) or \
isinstance(genelength, np.ndarray)
assert bool1, "genelength should be a pandas DataFrame, Series, "\
"numpy array or list."
if isinstance(genelength, pd.DataFrame):
assert genelength.shape[0] == rawcount.shape[0], \
"The number of rows in genelength should be the same as " \
"that of rawcount."
if genelength.shape[1] > 1:
print("genelength contains more than 1 column, only the"
" 1st column will be used.")
genelength = pd.DataFrame(genelength.iloc[:, 0])
assert genelength.applymap(np.isreal).all().bool()
elif isinstance(genelength, pd.Series):
assert len(genelength) == rawcount.shape[0], \
"The length of genelength should be the same as " \
"number of rows in rawcount."
genelength = pd.DataFrame(genelength)
assert genelength.applymap(np.isreal).all().bool()
elif isinstance(genelength, list):
assert all(isinstance(item, float) or isinstance(item, int)
for item in genelength)
assert len(genelength) == rawcount.shape[0], \
"The length of genelength should be the same as " \
"number of rows in rawcount."
genelength = pd.DataFrame(genelength)
else:
if genelength.shape[0] > 1:
print("numpy array has more than 1 dimension, only the "
"1st dimension will be used.")
genelength = genelength[0]
assert np.isreal(genelength)
genelength = pd.DataFrame(genelength).transpose()
return genelength
def _count2FPKM(self, rawcount, genelength):
"""Convert gene expression data from raw count to FPKM.
This function converts gene expression data from raw count to FPKM.
:param rawcount: a pandas DataFrame which contains gene expression
count data.
:param genelength: a pandas Series, DataFrame, numpy array, or list
which contains gene length in bp. The size of genelength should be
equal to the number of rows in exprdata. This argument is optional.
If using exprtype="FPKM", genelength argument is ignored. If using
exprtype="count", the raw count will be converted to FPKM. If
genelength is provided, the function will convert raw count to FPKM
according to the user-supplied gene length. Otherwise, gene length
is obtained from the internal database.
:return: a pandas DataFrame contains FPKM.
"""
genelength = self._processlength(rawcount, genelength)
assert (genelength.iloc[:, 0].dropna() > 0).all(), \
"genelength cannot contain negative value(s) or 0."
num_nas = genelength.iloc[:, 0].isna().sum()
if num_nas > 0:
print("Can't find gene length for {:.2f}% genes when converting "
"raw count to FPKM.".format(num_nas/genelength.shape[0]*100))
countsum = rawcount.apply(sum)
bg = pd.DataFrame(columns=rawcount.columns, index=rawcount.index)
bg.loc[:, :] = countsum.values
wid = pd.DataFrame(columns=rawcount.columns, index=rawcount.index)
wid.loc[:, :] = genelength.values
return rawcount / (wid/1000) / (bg/1e6)
[docs] def predict_age(self, exprdata, genelength=None, chronage=None):
"""Calculate RNA age.
This function calculates RNA age based on pre-trained predictors.
:param exprdata: a pandas DataFrame which contains gene expression data
with each row represents a gene and each column represents a sample.
Use the argument "exprtype" to specify raw count or FPKM. The index of
"exprdata" should be gene ids and columns names of "exprdata" should
be sample ids.
:param genelength: a pandas Series, DataFrame, numpy array, or list
which contains gene length in bp. The size of genelength should be
equal to the number of rows in exprdata. This argument is optional.
If using exprtype="FPKM", genelength argument is ignored. If using
exprtype="count", the raw count will be converted to FPKM. If
genelength is provided, the function will convert raw count to FPKM
according to the user-supplied gene length. Otherwise, gene length
is obtained from the internal database.
:param chronage: a pandas DataFrame which contains the chronological
age of each sample. This argument is optional. If provided, it should
be a DataFrame with 1st column sample id and 2nd column chronological
age. The sample order in chronage doesn't have to be in the same order
as in exprdata. However, the samples in chronage and exprdata should
be the same. If some samples' chronological age are not available,
users are expected to set the chronological age in chronage to NaN.
If chronage contains more than 2 columns, only the first 2 columns
will be considered. If this argument is not provided, the age
acceleration residual will not be calculated. See package tutorial
for the definition of age acceleration residual.
:return: a pandas DataFrame contains RNA age.
"""
# check input:
assert isinstance(exprdata, pd.DataFrame), \
"exprdata should be a pandas DataFrame."
assert exprdata.applymap(np.isreal).all().all(),\
"Only numeric values are allowed in the exprdata DataFrame."
assert list(exprdata.index) != list(range(exprdata.shape[0])), \
"The index of exprdata should be gene ids."
assert list(exprdata.columns) != list(range(exprdata.shape[1])), \
"The column names of exprdata should be sample ids."
assert ~np.any(exprdata.index.duplicated()), \
"Duplicated gene names found in exprdata."
assert (exprdata >= 0).all().all(), \
"Gene expression data cannot contain negative value(s)."
if chronage is not None:
assert isinstance(chronage, pd.DataFrame), \
"chronage should be a pandas DataFrame."
if(chronage.shape[1] > 2):
print("More than 2 columns are provided in chronage. "
"Only the first 2 columns will be used.")
# assert ~chronage.applymap(np.isreal).all()[0], \
# "The 1st column in chronage should be sample ids."
assert chronage.applymap(np.isreal).all()[1], \
"The 2nd column in chronage should be chronological age."
assert ~any(chronage.iloc[:, 0].duplicated()), \
"chronage contains duplicated sample ids."
assert set(chronage.iloc[:, 0].astype(str)) == \
set(exprdata.columns), \
"Samples in chronage and exprdata should be the same."
assert ~np.any(chronage.iloc[:, 1] < 0), \
"Chronological age contains negative value(s)."
if self._exprtype == "count":
exprdata = self._count2FPKM(exprdata, genelength)
if self.idtype != "symbol":
mg = mygene.MyGeneInfo()
genes = list(exprdata.index)
temp = mg.querymany(genes, scopes=self.idtype, fields='symbol',
species='human', returnall=True,
as_dataframe=True)["out"]
temp = temp.loc[~temp["symbol"].isna(), "symbol"]
temp = temp[~temp.index.duplicated(keep="first")]
temp = temp.drop_duplicates(keep=False)
genesymbol = temp[exprdata.index]
genesymbol[genesymbol.isna()] = "unknown"
exprdata.index = genesymbol
location = os.path.dirname(os.path.realpath(__file__))
if self.stype == "all":
tempPath = os.path.join(location, "internal_data", "all",
"coef_{}_{}.csv".format(self._tissue,
self._signature))
else:
tempPath = os.path.join(location, "internal_data", "Caucasian",
"coef_{}_{}.csv".format(self._tissue,
self._signature))
sig_internal = pd.read_csv(tempPath, index_col=0)
genes_required = sig_internal.index[1:]
sig_in_expr = genes_required.isin(exprdata.index)
# full NA row
if np.sum(~sig_in_expr) != 0:
print("{:.2f}% genes in the gene signature are not included in "
"the supplied gene expression."
.format(np.sum(~sig_in_expr)/len(genes_required)*100))
# impute the gene expression in the log scale
tempmat = pd.DataFrame(columns=exprdata.columns,
index=genes_required[~sig_in_expr])
exprdata_withNA = pd.concat([exprdata, tempmat], axis=0)
exprdata_log = np.log2(exprdata_withNA.apply(pd.to_numeric) + 1)
ind1 = exprdata_log.isna().all(axis=1)
ind2 = ~exprdata_log.isna().any(axis=1)
exprdata_log.loc[(ind1 | ind2), :] = \
exprdata_log.loc[(ind1 | ind2), :].fillna(exprdata_log.mean())
else:
exprdata_log = np.log2(exprdata.apply(pd.to_numeric) + 1)
# check partial NA
if ~exprdata_log.notnull().all().all():
# impute the gene expression in the log scale
imputer = KNNImputer(n_neighbors=min(10, exprdata_log.shape[1]),
row_max_missing=1, col_max_missing=1)
X_imputed = imputer.fit_transform(exprdata_log.transpose())
exprdata_log_impute = pd.DataFrame(X_imputed).transpose()
exprdata_log_impute.index = exprdata_log.index
exprdata_sub = exprdata_log_impute.loc[genes_required, :]
else:
exprdata_sub = exprdata_log.loc[genes_required, :]
RNAAge = exprdata_sub.apply(
lambda x: np.sum(x.multiply(sig_internal.iloc[1:, 0])) +
sig_internal.iloc[0, 0])
res = pd.DataFrame(index=exprdata.columns)
res["RNAAge"] = list(RNAAge)
if chronage is not None:
chronage.index = chronage.iloc[:, 0]
res["ChronAge"] = chronage.loc[res.index].iloc[:, 1]
# if sample size is too small, age acceleration residual
# cannot be calculated
if res.dropna().shape[0] > 30:
Y = res["RNAAge"]
X = res["ChronAge"]
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
res["AgeAccelResid"] = model.resid
return res
@staticmethod
def _tissue_set():
return set(["adipose_tissue", "adrenal_gland", "blood", "blood_vessel",
"brain", "breast", "colon", "esophagus", "heart", "liver",
"lung", "muscle", "nerve", "ovary", "pancreas",
"pituitary", "prostate", "salivary_gland", "skin",
"small_intestine", "spleen", "stomach", "testis",
"thyroid", "uterus", "vagina", "all_tissues"])
@staticmethod
def _sig_set():
return set(["DESeq", "DESeq2", "Pearson", "Dev", "deMagalhaes",
"GenAge", "GTExAge", "Peters", "all"])
def __repr__(self):
return """RNAAgeCalc(tissue=%r, exprtype=%r, idtype=%r, stype=%r,
signature=%r)""" % (self.tissue, self.exprtype, self.idtype,
self.stype, self.signature)
def __str__(self):
return """RNAAgeCalc
tissue: {}
exprtype: {}
idtype: {}
stype: {}
signature: {}""".format(self.tissue, self.exprtype, self.idtype,
self.stype, self.signature)