Source code for racpy.core

import itertools
import numpy as np
import os
import pandas as pd
import statsmodels.api as sm
import mygene
from missingpy import KNNImputer
from . import exceptions as e


[docs]class RNAAgeCalc(object):
    '''Calculate RNA age.

    Attributes:
    - tissue: a string indicates which tissue the gene expression data
    is obtained from. Users are expected to provide one of the these tissues:

    adipose_tissue, adrenal_gland, blood, blood_vessel, brain, breast, colon,
    esophagus, heart, liver, lung, muscle, nerve, ovary, pancreas, pituitary,
    prostate, salivary_gland, skin, small_intestine, spleen, stomach, testis,
    thyroid, uterus, vagina.

    If the tissue argument is not provide or the provided tissue is not in
    this list, then the age predictor trained on all tissues will be used to
    calculate RNA age.

    - exprtype: either "count" or "FPKM". Default is "FPKM". For RPKM data,
    please use exprtype="FPKM".

    - idtype: a string which indicates the gene id type in "exprdata". It
    should be one of "symbol", "ensembl.gene", "entrezgene" or "refseq".
    Default is "symbol".

    - stype: a string which specifies which version of pre-trained
    calculators to be used. It should be either "all" or "Caucasian". "all"
    means samples from all races (American Indian/Alaska Native, Asian,
    Black/African American, and Caucasian) are used to obtain the pre-trained
    calculator. "Caucasian" means only the Caucasian samples are used to build
    up the pre-trained calculator. Default is "all".

    - signature: a string which indicate the age signature to use when
    calculating RNA age. This argument is not required.

    In the case that this argument is not provided, if `tissue` argument is
    also provided and the tissue is in the list above, the tissue specific age
    signature given by our DESeq2 analysis result on GTEx data will be used.
    Otherwise, the across tissue signature "GTExAge" will be used.

    In the case that this argument is provided, it should be one of the
    following signatures. A detailed description of the meaning of these
    signatures is given in the package tutorial.

    DESeq2, Pearson, Dev, deMagalhaes, GenAge, GTExAge, Peters, all
    '''

    def __init__(self, tissue=None, exprtype="FPKM", idtype="symbol",
                 stype="all", signature=None):
        # check input
        self._check_exprtype(exprtype)
        self._exprtype = exprtype

        self._check_idtype(idtype)
        self._idtype = idtype

        self._check_stype(stype)
        self._stype = stype
        # end check input

        # process tissue and signature argument
        if tissue is None:
            print("tissue is not provided, using the RNA age predictor "
                  "trained by all tissues automatically.")
            tissue = "all_tissues"
            if signature is None:
                print("signature is not provided, using GTExAge signature "
                      "automatically.")
                signature = "GTExAge"
            else:
                assert signature in self._sig_set(), \
                    "signature should be one of DESeq2, Pearson, Dev, " \
                    "deMagalhaes, GenAge, GTExAge, Peters, all."
                if signature == "DESeq":
                    signature = "DESeq2"
                if signature == "DESeq2":
                    print("DESeq2 signature is currently not available for all "
                          "tissues, using Pearson signature automatically.")
                    signature = "Pearson"
        else:
            assert isinstance(tissue, str), "tissue should be a string."
            if tissue.lower() in self._tissue_set():
                tissue = tissue.lower()
                if signature is None:
                    print("signature is not provided, using DESeq2 signature "
                          "automatically.")
                    signature = "DESeq2"
                else:
                    assert signature in self._sig_set(), \
                        "signature should be one of DESeq2, Pearson, Dev, " \
                        "deMagalhaes, GenAge, GTExAge, Peters, all."
                    if signature == "DESeq":
                        signature = "DESeq2"
            else:
                print("the provided tissue was not found, using the RNA age "
                      "predictor trained by all tissues automatically.")
                tissue = "all_tissues"
                if signature is None:
                    print("signature is not provided, using GTExAge "
                          "signature automatically.")
                    signature = "GTExAge"
                else:
                    assert signature in self._sig_set(), \
                        "signature should be one of DESeq2, Pearson, Dev, " \
                        "deMagalhaes, GenAge, GTExAge, Peters, all."
                    if signature == "DESeq":
                        signature = "DESeq2"
                    if signature == "DESeq2":
                        print("DESeq2 signature is currently not available for "
                              "all tissues, using Pearson signature "
                              "automatically.")
                        signature = "Pearson"
        self._tissue = tissue
        self._signature = signature
        # end: process tissue and signature argument

    @property
    def exprtype(self):
        return self._exprtype

    @exprtype.setter
    def exprtype(self, exprtype):
        self._check_exprtype(exprtype)
        self._exprtype = exprtype

    @staticmethod
    def _check_exprtype(exprtype):
        assert isinstance(exprtype, str), "exprtype should be a string."
        assert exprtype in ["count", "FPKM"], "exprtype should be count " \
                                              "or FPKM."

    @property
    def idtype(self):
        return self._idtype

    @idtype.setter
    def idtype(self, idtype):
        self._check_idtype(idtype)
        self._idtype = idtype

    @staticmethod
    def _check_idtype(idtype):
        assert isinstance(idtype, str), "idtype should be a string."
        assert idtype in ["symbol", "ensembl.gene", "entrezgene", "refseq"], \
            "idtype should be one of symbol, ensembl.gene, entrezgene" \
            " or refseq."

    @property
    def stype(self):
        return self._stype

    @stype.setter
    def stype(self, stype):
        self._check_stype(stype)
        self._stype = stype

    @staticmethod
    def _check_stype(stype):
        assert isinstance(stype, str), "stype should be a string."
        stype = stype.lower()
        assert stype in ["all", "caucasian"], \
            "stype should be either all or Caucasian."

    @property
    def tissue(self):
        return self._tissue

    @tissue.setter
    def tissue(self, tissue):
        raise e.SetterError("Cannot set tissue, please construct a new "
                            "RNAAgeCalc object and use the tissue argument"
                            " to set tissue. ")

    @property
    def signature(self):
        return self._signature

    @signature.setter
    def signature(self, signature):
        raise e.SetterError("Cannot set signature, please construct a new "
                            "RNAAgeCalc object and use the signature argument"
                            " to set signature. ")

    def _processlength(self, rawcount, genelength):
        if genelength is None:
            location = os.path.dirname(os.path.realpath(__file__))
            lengthDB = pd.read_csv(os.path.join(location, "internal_data",
                                                "lengthDB.csv"), index_col=0)
            if self.idtype != "ensembl.gene":
                # convert the gene id to ensembl to match the gene id in gene
                # length database
                mg = mygene.MyGeneInfo()
                genes = list(rawcount.index)
                print("converting the gene id to ensembl to match the gene id "
                      "in gene length database. This may take some time.")
                temp = mg.querymany(genes, scopes='symbol',
                                    fields='ensembl.gene',
                                    species='human', returnall=True,
                                    as_dataframe=True)["out"]
                temp1 = temp.loc[~temp["ensembl.gene"].isna(), "ensembl.gene"]
                temp2 = temp.loc[~temp["ensembl"].isna(), "ensembl"]
                index_expand = []
                value_expand = []
                for index, value in temp2.iteritems():
                    index_expand.append([index]*len(value))
                    value_expand.append(list(map(lambda x: x["gene"], value)))
                temp3 = pd.Series(
                    list(itertools.chain.from_iterable(value_expand)),
                    index=list(itertools.chain.from_iterable(index_expand)))
                temp1 = temp1.append(temp3)

                temp1 = temp1[temp1.isin(lengthDB.index)]
                temp1 = temp1[~temp1.index.duplicated(keep="first")]
                temp1 = temp1.drop_duplicates(keep=False)

                genename = temp1[rawcount.index]
                genename[genename.isnull()] = "unknown"
                genelength = lengthDB.loc[genename]
            else:
                genelength = lengthDB.loc[rawcount.index]
        else:
            bool1 = isinstance(genelength, pd.DataFrame) or \
                    isinstance(genelength, pd.Series) or \
                    isinstance(genelength, list) or \
                    isinstance(genelength, np.ndarray)
            assert bool1, "genelength should be a pandas DataFrame, Series, "\
                          "numpy array or list."

            if isinstance(genelength, pd.DataFrame):
                assert genelength.shape[0] == rawcount.shape[0], \
                    "The number of rows in genelength should be the same as " \
                    "that of rawcount."
                if genelength.shape[1] > 1:
                    print("genelength contains more than 1 column, only the"
                          " 1st column will be used.")
                    genelength = pd.DataFrame(genelength.iloc[:, 0])
                assert genelength.applymap(np.isreal).all().bool()
            elif isinstance(genelength, pd.Series):
                assert len(genelength) == rawcount.shape[0], \
                    "The length of genelength should be the same as " \
                    "number of rows in rawcount."
                genelength = pd.DataFrame(genelength)
                assert genelength.applymap(np.isreal).all().bool()
            elif isinstance(genelength, list):
                assert all(isinstance(item, float) or isinstance(item, int)
                           for item in genelength)
                assert len(genelength) == rawcount.shape[0], \
                    "The length of genelength should be the same as " \
                    "number of rows in rawcount."
                genelength = pd.DataFrame(genelength)
            else:
                if genelength.shape[0] > 1:
                    print("numpy array has more than 1 dimension, only the "
                          "1st dimension will be used.")
                    genelength = genelength[0]
                assert np.isreal(genelength)
                genelength = pd.DataFrame(genelength).transpose()
        return genelength

    def _count2FPKM(self, rawcount, genelength):
        """Convert gene expression data from raw count to FPKM.

        This function converts gene expression data from raw count to FPKM.

        :param rawcount: a pandas DataFrame which contains gene expression
        count data.

        :param genelength: a pandas Series, DataFrame, numpy array, or list
        which contains gene length in bp. The size of genelength should be
        equal to the number of rows in exprdata. This argument is optional.

        If using exprtype="FPKM", genelength argument is ignored. If using
        exprtype="count", the raw count will be converted to FPKM. If
        genelength is provided, the function will convert raw count to FPKM
        according to the user-supplied gene length. Otherwise, gene length
        is obtained from the internal database.

        :return: a pandas DataFrame contains FPKM.
        """

        genelength = self._processlength(rawcount, genelength)
        assert (genelength.iloc[:, 0].dropna() > 0).all(), \
            "genelength cannot contain negative value(s) or 0."

        num_nas = genelength.iloc[:, 0].isna().sum()
        if num_nas > 0:
            print("Can't find gene length for {:.2f}% genes when converting "
                  "raw count to FPKM.".format(num_nas/genelength.shape[0]*100))

        countsum = rawcount.apply(sum)
        bg = pd.DataFrame(columns=rawcount.columns, index=rawcount.index)
        bg.loc[:, :] = countsum.values

        wid = pd.DataFrame(columns=rawcount.columns, index=rawcount.index)
        wid.loc[:, :] = genelength.values

        return rawcount / (wid/1000) / (bg/1e6)

[docs]    def predict_age(self, exprdata, genelength=None, chronage=None):
        """Calculate RNA age.

        This function calculates RNA age based on pre-trained predictors.

        :param exprdata: a pandas DataFrame which contains gene expression data
        with each row represents a gene and each column represents a sample.
        Use the argument "exprtype" to specify raw count or FPKM. The index of
        "exprdata" should be gene ids and columns names of "exprdata" should
        be sample ids.

        :param genelength: a pandas Series, DataFrame, numpy array, or list
        which contains gene length in bp. The size of genelength should be
        equal to the number of rows in exprdata. This argument is optional.
        If using exprtype="FPKM", genelength argument is ignored. If using
        exprtype="count", the raw count will be converted to FPKM. If
        genelength is provided, the function will convert raw count to FPKM
        according to the user-supplied gene length. Otherwise, gene length
        is obtained from the internal database.

        :param chronage: a pandas DataFrame which contains the chronological
        age of each sample. This argument is optional. If provided, it should
        be a DataFrame with 1st column sample id and 2nd column chronological
        age. The sample order in chronage doesn't have to be in the same order
        as in exprdata. However, the samples in chronage and exprdata should
        be the same. If some samples' chronological age are not available,
        users are expected to set the chronological age in chronage to NaN.
        If chronage contains more than 2 columns, only the first 2 columns
        will be considered. If this argument is not provided, the age
        acceleration residual will not be calculated. See package tutorial
        for the definition of age acceleration residual.

        :return: a pandas DataFrame contains RNA age.

        """

        # check input:
        assert isinstance(exprdata, pd.DataFrame), \
            "exprdata should be a pandas DataFrame."
        assert exprdata.applymap(np.isreal).all().all(),\
            "Only numeric values are allowed in the exprdata DataFrame."
        assert list(exprdata.index) != list(range(exprdata.shape[0])), \
            "The index of exprdata should be gene ids."
        assert list(exprdata.columns) != list(range(exprdata.shape[1])), \
            "The column names of exprdata should be sample ids."
        assert ~np.any(exprdata.index.duplicated()), \
            "Duplicated gene names found in exprdata."
        assert (exprdata >= 0).all().all(), \
            "Gene expression data cannot contain negative value(s)."

        if chronage is not None:
            assert isinstance(chronage, pd.DataFrame), \
                "chronage should be a pandas DataFrame."
            if(chronage.shape[1] > 2):
                print("More than 2 columns are provided in chronage. "
                      "Only the first 2 columns will be used.")
            # assert ~chronage.applymap(np.isreal).all()[0], \
            #    "The 1st column in chronage should be sample ids."
            assert chronage.applymap(np.isreal).all()[1], \
                "The 2nd column in chronage should be chronological age."
            assert ~any(chronage.iloc[:, 0].duplicated()), \
                "chronage contains duplicated sample ids."
            assert set(chronage.iloc[:, 0].astype(str)) == \
                set(exprdata.columns), \
                "Samples in chronage and exprdata should be the same."
            assert ~np.any(chronage.iloc[:, 1] < 0), \
                "Chronological age contains negative value(s)."

        if self._exprtype == "count":
            exprdata = self._count2FPKM(exprdata, genelength)

        if self.idtype != "symbol":
            mg = mygene.MyGeneInfo()
            genes = list(exprdata.index)
            temp = mg.querymany(genes, scopes=self.idtype, fields='symbol',
                                species='human', returnall=True,
                                as_dataframe=True)["out"]
            temp = temp.loc[~temp["symbol"].isna(), "symbol"]
            temp = temp[~temp.index.duplicated(keep="first")]
            temp = temp.drop_duplicates(keep=False)
            genesymbol = temp[exprdata.index]
            genesymbol[genesymbol.isna()] = "unknown"
            exprdata.index = genesymbol

        location = os.path.dirname(os.path.realpath(__file__))
        if self.stype == "all":
            tempPath = os.path.join(location, "internal_data", "all",
                                    "coef_{}_{}.csv".format(self._tissue,
                                                            self._signature))
        else:
            tempPath = os.path.join(location, "internal_data", "Caucasian",
                                    "coef_{}_{}.csv".format(self._tissue,
                                                            self._signature))

        sig_internal = pd.read_csv(tempPath, index_col=0)
        genes_required = sig_internal.index[1:]
        sig_in_expr = genes_required.isin(exprdata.index)
        # full NA row
        if np.sum(~sig_in_expr) != 0:
            print("{:.2f}% genes in the gene signature are not included in "
                  "the supplied gene expression."
                  .format(np.sum(~sig_in_expr)/len(genes_required)*100))

            # impute the gene expression in the log scale
            tempmat = pd.DataFrame(columns=exprdata.columns,
                                   index=genes_required[~sig_in_expr])

            exprdata_withNA = pd.concat([exprdata, tempmat], axis=0)
            exprdata_log = np.log2(exprdata_withNA.apply(pd.to_numeric) + 1)
            ind1 = exprdata_log.isna().all(axis=1)
            ind2 = ~exprdata_log.isna().any(axis=1)
            exprdata_log.loc[(ind1 | ind2), :] = \
                exprdata_log.loc[(ind1 | ind2), :].fillna(exprdata_log.mean())
        else:
            exprdata_log = np.log2(exprdata.apply(pd.to_numeric) + 1)

        # check partial NA
        if ~exprdata_log.notnull().all().all():
            # impute the gene expression in the log scale
            imputer = KNNImputer(n_neighbors=min(10, exprdata_log.shape[1]),
                                 row_max_missing=1, col_max_missing=1)
            X_imputed = imputer.fit_transform(exprdata_log.transpose())
            exprdata_log_impute = pd.DataFrame(X_imputed).transpose()
            exprdata_log_impute.index = exprdata_log.index
            exprdata_sub = exprdata_log_impute.loc[genes_required, :]
        else:
            exprdata_sub = exprdata_log.loc[genes_required, :]

        RNAAge = exprdata_sub.apply(
            lambda x: np.sum(x.multiply(sig_internal.iloc[1:, 0])) +
            sig_internal.iloc[0, 0])
        res = pd.DataFrame(index=exprdata.columns)
        res["RNAAge"] = list(RNAAge)

        if chronage is not None:
            chronage.index = chronage.iloc[:, 0]
            res["ChronAge"] = chronage.loc[res.index].iloc[:, 1]
            # if sample size is too small, age acceleration residual
            # cannot be calculated
            if res.dropna().shape[0] > 30:
                Y = res["RNAAge"]
                X = res["ChronAge"]
                X = sm.add_constant(X)
                model = sm.OLS(Y, X).fit()
                res["AgeAccelResid"] = model.resid

        return res

    @staticmethod
    def _tissue_set():
        return set(["adipose_tissue", "adrenal_gland", "blood", "blood_vessel",
                    "brain", "breast", "colon", "esophagus", "heart", "liver",
                    "lung", "muscle", "nerve", "ovary", "pancreas",
                    "pituitary", "prostate", "salivary_gland", "skin",
                    "small_intestine", "spleen", "stomach", "testis",
                    "thyroid", "uterus", "vagina", "all_tissues"])

    @staticmethod
    def _sig_set():
        return set(["DESeq", "DESeq2", "Pearson", "Dev", "deMagalhaes",
                    "GenAge", "GTExAge", "Peters", "all"])

    def __repr__(self):
        return """RNAAgeCalc(tissue=%r, exprtype=%r, idtype=%r, stype=%r,
signature=%r)""" % (self.tissue, self.exprtype, self.idtype,
                    self.stype, self.signature)

    def __str__(self):
        return """RNAAgeCalc
tissue: {}
exprtype: {}
idtype: {}
stype: {}
signature: {}""".format(self.tissue, self.exprtype, self.idtype,
                        self.stype, self.signature)
Source code for racpy.core

racpy

Navigation

Related Topics