Source code for etfl.data.ecoli

# -*- coding: utf-8 -*-

from os.path import join as pjoin
import os

import cobra

import pandas as pd

import numpy as np

from pytfa.io import load_thermoDB,                    \
                            read_lexicon, annotate_from_lexicon,            \
                            read_compartment_data, apply_compartment_data

from .ecoli_utils import infer_enzyme_from_gpr

from ..core.expression import is_me_compatible
from ..core import Enzyme, Ribosome, RNAPolymerase, ThermoMEModel, MEModel
from ..core.rna import mRNA

from collections import defaultdict
from numbers import Number

import re

[docs]def clean_string(s): s = s.replace('-','_') # Remove invalid characters s = re.sub('[^0-9a-zA-Z_]', '', s) # Remove leading characters until we find a letter or underscore s = re.sub('^[^a-zA-Z_]+', '', s) return s
[docs]file_dir = os.path.dirname(os.path.abspath(__file__))
[docs]data_dir = pjoin(file_dir,'../../organism_data/info_ecoli')
######################### ### BASE MODEL ### #########################
[docs]def get_model(solver): vanilla_model = cobra.io.load_json_model('iJO1366_with_xrefs.json') vanilla_model.slim_optimize() vanilla_model.solver = solver vanilla_model.slim_optimize() def sanitize_varnames(model): for met in model.metabolites: if met.id[0].isdigit(): met.id = '_' + met.id for rxn in model.reactions: if rxn.id[0].isdigit(): rxn.id = '_' + rxn.id model.repair() return model # Add cystein -> selenocystein transformation for convenience selcys = cobra.Metabolite(id='selcys__L_c', compartment='c', formula='C3H7NO2Se') selcys_rxn = cobra.Reaction(id='PSEUDO_selenocystein_synthase', name='PSEUDO Selenocystein_Synthase') selcys_rxn.add_metabolites( {vanilla_model.metabolites.cys__L_c: -1, selcys: +1}) vanilla_model.add_reactions([selcys_rxn]) sanitize_varnames(vanilla_model) vanilla_model.slim_optimize() return vanilla_model
# ------------------------------------------------------------ # Thermo # ------------------------------------------------------------
[docs]def get_thermo_data(): # Load Thermo data thermo_data = load_thermoDB(pjoin(data_dir,'thermo_data','thermo_data.thermodb')) lexicon = read_lexicon(pjoin(data_dir,'thermo_data','iJO1366_lexicon.csv')) # lexicon = curate_lexicon(read_lexicon('thermo_data/iJO1366_lexicon.csv')) compartment_data = read_compartment_data(pjoin(data_dir,'thermo_data', 'iJO1366_compartment_data.json')) def curate_lexicon(lexicon): ix = pd.Series(lexicon.index) ix = ix.apply(lambda s: str.replace(s,'-','__')) ix = ix.apply(lambda s: '_'+s if s[0].isdigit() else s) lexicon.index = ix return lexicon lexicon = curate_lexicon(lexicon) return thermo_data, lexicon, compartment_data
#------------------------------------------------------------ # Data #------------------------7.54-------------------------------- # Essentials
[docs]def get_essentials(): return dict(atp='atp_c', adp='adp_c', amp='amp_c', gtp='gtp_c', gdp='gdp_c', pi='pi_c', ppi='ppi_c', h2o='h2o_c', h='h_c')
# Growth-related abundances
[docs]def get_neidhardt_data(): neidhardt_data = pd.read_excel(pjoin(data_dir,'neidhardt_tab2.xlsx'), skiprows=range(0,6), skipfooter=22) mu_cols = ['mu=0.6','mu=1.0','mu=1.5','mu=2.0','mu=2.5'] neidhardt_data.columns = ['parameter','symbol','units'] + mu_cols \ + ['observed_parameters','footnote'] neidhardt_data.set_index('symbol', inplace=True) Pc = neidhardt_data.loc['Pc (μg)'][mu_cols] # μg/10^9 cells Rc = neidhardt_data.loc['Rc (μg)'][mu_cols] # μg/10^9 cells Dc = neidhardt_data.loc['Gc (μg)'][mu_cols] # μg/10^9 cells Mc = neidhardt_data.loc['Mc (μg)'][mu_cols] # μg dry weight/10^9 cells neidhardt_prel = (Pc/Mc).astype(float) neidhardt_rrel = (Rc/Mc).astype(float) neidhardt_drel = (Dc/Mc).astype(float) neidhardt_mu = pd.Series(Pc.index.str.replace('mu=','')).astype(float) return neidhardt_mu, neidhardt_rrel, neidhardt_prel, neidhardt_drel
#------------------------------------------------------------ # Expression #------------------------------------------------------------ # Data # Sequences from KEGG
[docs]nt_sequences = pd.read_csv(pjoin(data_dir,'iJO1366_nt_seq_kegg.csv'), index_col = 0, header = None)[1]
[docs]def get_nt_sequences(): return nt_sequences
# iJO kcat info from: # Davidi, Dan, et al. # "Global characterization of in vivo enzyme catalytic rates and their correspondence to in vitro kcat measurements." # Proceedings of the National Academy of Sciences 113.12 (2016): 3401-3406.
[docs]kcat_info_milo = pd.read_excel(pjoin(data_dir,'pnas.1514240113.sd01.xlsx'), sheet_name='kcat 1s', header=2,
)
[docs]kmax_info_milo = pd.read_excel(pjoin(data_dir,'pnas.1514240113.sd01.xlsx'), sheet_name='kmax 1s', header=2,
)
[docs]kcat_info_aggregated = pd.read_csv(pjoin(data_dir,'aggregated_kcats.csv'), index_col = 0)
[docs]ec_info_ecocyc = pd.read_csv(pjoin(data_dir,'complex2ec.csv'), index_col = 0)
[docs]composition_info_ecocyc = pd.read_csv(pjoin(data_dir,'complex2genes.csv'), index_col = 0)
[docs]reaction2complexes_info_obrien = pd.read_excel( pjoin(data_dir, 'obrien2013_SI_tab10.xlsx'), index_col=0, usecols=[0, 1])
[docs]complexes2peptides_info_obrien = pd.read_excel( pjoin(data_dir, 'obrien2013_SI_tab1.xlsx'), index_col=0, usecols=[0, 1])
[docs]reaction2complexes_info_lloyd = pd.read_csv( pjoin(data_dir, 'lloyd_2018_enzyme_reaction_association.txt'), delimiter = '\t', index_col = 0, header=None)
reaction2complexes_info_lloyd.columns = ['Enzymes']
[docs]complexes2peptides_info_lloyd = pd.read_csv( pjoin(data_dir, 'lloyd_2018_protein_complexes.txt'), delimiter = '\t', index_col = 0, usecols=[0,2], header = None)
[docs]complexes2peptides_info_lloyd.columns = ['Gene composition']
[docs]gene_names = pd.read_csv(pjoin(data_dir,'gene2bname.txt'), delimiter='\t', index_col=0)
# mRNA degardation rates from # Bernstein et al. (2002) Proc. Natl. Acad. Sci. USA, 10.1073/pnas.112318199 # "Global analysis of mRNA decay and abundance in Escherichia coli at single-gene resolution using two-color fluorescent DNA microarrays"
[docs]bernstein_ecoli_deg_rates = pd.read_excel( pjoin(data_dir,'bernstein_2002_mrna_deg.xls'), skiprows=range(8), index_col=0)
# Bionumber # http://bionumbers.hms.harvard.edu/bionumber.aspx?id=100528 # ID 100528 # Property GC content of E. coli K12 chromosome # Organism Bacteria Escherichia coli # Value 50.8 # Units % # Reference Escherichia coli K12 (Escherichia coli K12 substr. MG1655) Genome Browser Gateway link scroll down to slightly above bottom of page # Comments GC content value given 50.79%. Length of chromosome 4,639,675bp. Retrieved March 29th 2017 # Assumed GC ratio of 0.5078
[docs]gc_ratio = 0.5078 # %
[docs]chromosome_len = 4639675 # bp
[docs]def get_ecoli_gen_stats(): return chromosome_len, gc_ratio
[docs]def get_ratios(): # Bionumbers # ID 104876 # Property Amino acid composition of the proteins from E. coli cell supernatant # http://bionumbers.hms.harvard.edu/bionumber.aspx?id=104876 # Unit: per 100 moles aas aa_ratios = { 'K':9.01/100, # lys__L_c 'H':1.91/100, # his__L_c 'R':7.30/100, # arg__L_c 'D':8.30/100, # asp__L_c 'E':10.08/100, # glu__L_c 'G':8.18/100, # gly_c 'A':10.98/100, # ala__L_c 'V':9.63/100, # val__L_c 'L':7.40/100, # leu__L_c 'I':5.51/100, # ile__L_c 'T':5.22/100, # thr__L_c 'S':4.38/100, # ser__L_c 'P':3.67/100, # pro__L_c 'Y':1.78/100, # tyr__L_c 'F':3.03/100, # phe__L_c 'W':0.69/100, # trp__L_c 'M':2.40/100, # met__L_c 'C':0.53/100, # cys__L_c } nt_ratios = {'u':0.5*(1-gc_ratio), 'a':0.5*(1-gc_ratio), 'g':0.5*gc_ratio, 'c':0.5*gc_ratio, } return nt_ratios,aa_ratios
[docs]def get_monomers_dict(): aa_dict = {'A': 'ala__L_c', 'R': 'arg__L_c', 'N': 'asn__L_c', 'D': 'asp__L_c', # 'B':'asx', 'C': 'cys__L_c', 'E': 'glu__L_c', 'Q': 'gln__L_c', # 'Z':'glx', 'G': 'gly_c', 'H': 'his__L_c', 'I': 'ile__L_c', 'L': 'leu__L_c', 'K': 'lys__L_c', 'M': 'met__L_c', 'F': 'phe__L_c', 'P': 'pro__L_c', 'S': 'ser__L_c', 'T': 'thr__L_c', 'U': 'selcys__L_c', 'W': 'trp__L_c', 'Y': 'tyr__L_c', 'V': 'val__L_c', } rna_nucleotides = { 'u': 'utp_c', 'g': 'gtp_c', 'a': 'atp_c', 'c': 'ctp_c'} rna_nucleotides_mp = { 'u': 'ump_c', 'g': 'gmp_c', 'a': 'amp_c', 'c': 'cmp_c'} dna_nucleotides = { 't': 'dttp_c', 'g': 'dgtp_c', 'a': 'datp_c', 'c': 'dctp_c'} return aa_dict, rna_nucleotides, rna_nucleotides_mp, dna_nucleotides
[docs]def remove_from_biomass_equation(model, nt_dict, aa_dict, essentials_dict): # According to discussions, should only remove GAM mets_to_rm = dict() old_total_stoich = abs(sum([x for x in model.growth_reaction.metabolites.values() if x<0])) expression_mets = list(nt_dict.values()) + list(aa_dict.values()) # For ATP correction (see below) n_aa = 0 for m,stoich in model.growth_reaction.metabolites.items(): if m.id in expression_mets: mets_to_rm[m] = -1*stoich if m.id in aa_dict.values(): n_aa += stoich model.growth_reaction.add_metabolites(mets_to_rm) # We need to add back the ATP from the GAM that we just removed but should # still be taken into account. Indeed, nADP =/= nATP in the original # biomass reaction: # -54.119975 atp_c + .... --> 53.95 adp_c atp = model.metabolites.get_by_id(essentials_dict['atp']) adp = model.metabolites.get_by_id(essentials_dict['adp']) amp = model.metabolites.get_by_id(essentials_dict['amp']) pi = model.metabolites.get_by_id(essentials_dict['pi']) ppi = model.metabolites.get_by_id(essentials_dict['ppi']) h2o = model.metabolites.get_by_id(essentials_dict['h2o']) h = model.metabolites.get_by_id(essentials_dict['h']) atp_recovery = model.growth_reaction.metabolites[adp] # Omid's calculations: # There is also ATP used for generating the GTP for the synthesis of peptides # 2 GTP per aminoacid attached, 1 ATP per GTP # We need to compute how much aminoacid are consumed # We get it from the removal of the aminoacid earlier in the function # n_aa is <= 0, we remove here the ATP used for peptide synthesis # We add back ATP at the ADP previous level # Remove 2 ATP per aa polymerised into a peptide # +1 for tRNA charging : AA + uncharged_tRNA + ATP + 2 H2O -> charged_tRNA + AMP + PPI + 2H+ # model.growth_reaction.add_metabolites({atp: -1*atp_recovery - 3*n_aa , # h2o: -4*n_aa, # h2o consumed less (n_aa is negative), # adp: 2 * n_aa, # amp: 1 * n_aa, # pi: 2*n_aa, # ppi: 1*n_aa, # h: 4*n_aa, # }) model.growth_reaction.add_metabolites({atp: -1*atp_recovery - 2*n_aa , h2o: -2*n_aa, # h2o consumed less (n_aa is negative), adp: 2 * n_aa, amp: 0 * n_aa, pi: 2*n_aa, ppi: 0*n_aa, h: 2*n_aa,
}) # model.growth_reaction.add_metabolites({atp: -1*atp_recovery - 3*n_aa , # h2o: -2*n_aa, # h2o consumed less (n_aa is negative), # adp: 2 * n_aa, # amp: 0 * n_aa, # pi: 2*n_aa, # ppi: 0*n_aa, # h: 2*n_aa, # }) # Prot degradation # BNID 111930 # Moran MA et al., Sizing up metatranscriptomics. ISME J. 2013 Feb7(2):237-43. # A typical bacterial protein half-life is ~20 h # ------- # tau = 1/kdeg = t_0.5 /ln(2) # kdeg = ln(2)/t_0.5
[docs]kdeg_enz = np.log(2)/20 # [h-1]
# From : # http://book.bionumbers.org/how-fast-do-rnas-and-proteins-degrade/ # Figure 1: Measured half lives of mRNAs in E. coli, budding yeast and mouse NIH3T3 fibroblasts. # (A, adapted from J. A. Bernstein et al., Proc. Natl Acad. Sci. USA 99:9697, 2002; # B, adapted from Y. Wang et al., Proc. Natl Acad. Sci. USA 99:5860, 2002; # C. adapted from B. Schwanhausser, Nature, 473:337, 2013). # ------- # Mean half life of mrna is 5 minutes in ecoli # tau = 1/kdeg = t_0.5 /ln(2) # kdeg = ln(2)/t_0.5
[docs]kdeg_mrna = 60*np.log(2)/5
# Average mrna length from Bionumber 100023 # http://bionumbers.hms.harvard.edu/bionumber.aspx?&id=100023&ver=3 # mrna_length_avg = 370 # nm
[docs]mrna_length_avg = 1000
# Average peptide length
[docs]peptide_length_avg = int(np.round(mrna_length_avg/3))
[docs]def get_mrna_metrics(): return kdeg_mrna, mrna_length_avg
[docs]def get_enz_metrics(): return kdeg_enz, peptide_length_avg
# Generate a coupling dict
[docs]def is_gpr(s): return bool(s) and s != '[]'
# Milo kcats #############
[docs]def get_homomer_coupling_dict(model, mode = 'kcat'): if mode == 'kcat': k_info = kcat_info_milo k_column = 'kcat per active site [1/s]' n_column = 'catalytic sites per complex' elif mode == 'kmax': k_info = kmax_info_milo k_column = 'kmax per polypeptide chain [s-1]' n_column = 'polypeptides per complex' elif isinstance(mode, Number): k_info = kmax_info_milo k_column = 'kmax per polypeptide chain [s-1]' n_column = 'polypeptides per complex' else: raise Exception("Mode {} not understood. Should be 'kcat' or 'kmax' " "or a number") coupling_dict = dict() for x in model.reactions: composition, kcat_bwd, kcat_fwd = get_rate_constant(x, k_info, k_column, n_column) if isinstance(mode, Number): # It is a number kcat_fwd = kcat_bwd = mode if kcat_fwd == 0 and kcat_bwd == 0: continue if kcat_bwd == 0: kcat_bwd = kcat_fwd if kcat_fwd == 0: kcat_fwd = kcat_bwd if not composition: # WE cannot make the enzyme. To infer a composition, use # infer_missing_enzymes in get_coupling_dict continue #FIXME several polypeptides per complex ?? new_enzyme = Enzyme(x.id, kcat_fwd=kcat_fwd, kcat_bwd=kcat_bwd, kdeg=kdeg_enz, composition = composition) coupling_dict[x.id] = [new_enzyme] return coupling_dict
[docs]def get_rate_constant(reaction, k_info, k_column, n_column): k_data = k_info[k_info['reaction (model name)'] == reaction.id] k_data_reverse = k_info[k_info['reaction (model name)'] == reaction.id + '_reverse'] data = kcat_info_milo[kcat_info_milo['reaction (model name)'] == reaction.id] data_reverse = kcat_info_milo[kcat_info_milo['reaction (model name)'] == reaction.id + '_reverse'] kcat_fwd = 0 kcat_bwd = 0 composition = {} if len(k_data) > 0: candidate_k = k_data[k_column].iloc[0] n_peptides = data['polypeptides per complex'].iloc[0] \ if len(data) > 0 else 1 n_sites = data[n_column].iloc[0] \ if len(data) > 0 else 1 kcat_fwd = candidate_k \ * n_sites \ * 3600 # s/h composition = {k_data['bnumber'].iloc[0]: n_peptides} if len(k_data_reverse) > 0: candidate_k = k_data_reverse[k_column].iloc[0] n_peptides = data_reverse['polypeptides per complex'].iloc[0] \ if len(data_reverse) > 0 else 1 n_sites = data_reverse[n_column].iloc[0] \ if len(data_reverse) > 0 else 1 kcat_bwd = candidate_k \ * n_sites \ * 3600 # s/h composition = { k_data_reverse['bnumber'].iloc[0]: n_peptides} \ if not composition else composition return composition, kcat_bwd, kcat_fwd
# Aggregated kcats ##################
[docs]def ec2ecocyc(ec_number): if not isinstance(ec_number, list): return ec_info_ecocyc[ec_info_ecocyc['ec'] == ec_number] else: return ec_info_ecocyc[ec_info_ecocyc['ec'].isin(ec_number)]
[docs]def score_against_genes(putative_genes, reaction_genes): score = 0 putative_genes_list = putative_genes.split('" // "') reaction_gene_list = [x.name for x in reaction_genes] s1 = len(set(putative_genes_list).intersection(reaction_gene_list)) s2 = len(set(putative_genes_list).difference (reaction_gene_list)) s3 = len(set(reaction_gene_list) .difference (putative_genes_list)) score = 2*s1-s2-s3 # print(putative_genes_list, reaction_gene_list, score) return score
[docs]def match_ec_genes_ecocyc(ecocyc, genes, threshold=0.5): this_data = composition_info_ecocyc[composition_info_ecocyc['complex'].isin(ecocyc)] scores = this_data['putative_genes'].apply(score_against_genes, args=[genes]) selectable = this_data[scores>len(genes)*threshold] if len(selectable) == 0: return None, scores else: return selectable[scores == scores.max()].iloc[0], scores
[docs]def ecocyc2composition(ecocyc): ecocyc_comp = composition_info_ecocyc[composition_info_ecocyc['complex'] == ecocyc] # We do a left join to get the bnumbers that are used in the model composition_data = ecocyc_comp.merge(gene_names, right_index=True, how='left', left_on='gene') composition = defaultdict(int) for e,row in ecocyc_comp.iterrows(): this_gene = row['gene'] try: this_b_number = gene_names.loc[this_gene]['b#'] composition[this_b_number] += composition_data['coeffs'].iloc[0] except: # ecoli.logger.warning('Could not find gene associated to {}' # .format(row['obj_ids'])) # ecoli.logger.info(ecocyc_comp) pass return composition
[docs]comp_regex = re.compile(r'(b[0-9]{4})\((\d?)\)')
[docs]def complex2composition(complex_name): # Silence modifications if '_mod_' in complex_name: complex_name = complex_name[0:complex_name.index('_mod_')] try: composition_string = complexes2peptides_info_lloyd.loc[complex_name,'Gene composition'] except KeyError: composition_string = complexes2peptides_info_obrien.loc[complex_name,'Gene composition'] composition_dict = {} groups = comp_regex.findall(composition_string) for peptide, stoich in groups: if stoich == '': stoich = 1 else: stoich = int(stoich) composition_dict[peptide] = stoich return composition_dict
[docs]def ec2kcat(ec_number): try: return kcat_info_aggregated['kcat'].loc[ec_number].max() * 3600 # s/h except KeyError: return None
[docs]def check_id_in_reaction_list(the_id, df): if the_id in df.index: return the_id elif the_id[0] == '_' and the_id[1:] in df.index: return the_id[1:] elif the_id + '1' in df.index: return the_id + '1' else: return ''
[docs]def get_aggregated_coupling_dict(model, coupling_dict = dict()): aggregated_coupling_dict = defaultdict(list) for x in model.reactions: # reactions starting with a number have been sanitized to start with '_' if x.id in coupling_dict: # We already have info continue if 'ec_numbers' not in x.notes or x.notes['ec_numbers'] == ['nan']: # There is nothing we can do continue reaction_ecs = x.notes['ec_numbers'] lloyd_id = check_id_in_reaction_list(x.id, reaction2complexes_info_lloyd) obrien_id = check_id_in_reaction_list(x.id, reaction2complexes_info_obrien) if lloyd_id: complex_names = reaction2complexes_info_lloyd.loc[lloyd_id,'Enzymes']\ .split(' OR ') elif obrien_id: complex_names = reaction2complexes_info_obrien.loc[obrien_id,'Enzymes']\ .split(' OR ') else: continue for e,this_complex_name in enumerate(complex_names): # Start with this: composition = complex2composition(this_complex_name) if not composition: # Skip this one continue this_ec = x.notes['ec_numbers'][0] kcat = ec2kcat(this_ec) if kcat is None: continue cleaned_cplx_name = clean_string(this_complex_name) new_enzyme = Enzyme('{}_{}_{}'.format(x.id,cleaned_cplx_name,e), name='{}_{}: {}'.format(x.id, e, this_complex_name), kcat=kcat, kdeg=kdeg_enz, composition = composition) new_enzyme.notes['EC'] = this_ec aggregated_coupling_dict[x.id].append(new_enzyme) return aggregated_coupling_dict
[docs]def get_lloyd_keffs(): import json with open(pjoin(data_dir, 'lloyd_2018_keffs.json'), 'r') as fid: keffs = json.load(fid) new_keffs = dict() for key in keffs.keys(): new_key = key.replace('keff_','') new_key = new_key.replace('_DASH_','-') new_keffs[new_key] = keffs[key] * 3600 # s/h return new_keffs
[docs]def get_keffs_from_complex_name(keffs, name): try: kcat_fwd = keffs[name + '_forward_priming_keff'] except KeyError: kcat_fwd = None try: kcat_bwd = keffs[name + '_reverse_priming_keff'] except KeyError: kcat_bwd = None return kcat_fwd,kcat_bwd
[docs]def get_lloyd_coupling_dict(model, select=None): if select is None: select = model.reactions.list_attr('id') aggregated_coupling_dict = defaultdict(list) keffs = get_lloyd_keffs() for xid in select: x = model.reactions.get_by_id(xid) lloyd_id = check_id_in_reaction_list(x.id, reaction2complexes_info_lloyd) obrien_id = check_id_in_reaction_list(x.id, reaction2complexes_info_obrien) if lloyd_id: complex_names = reaction2complexes_info_lloyd.loc[lloyd_id, 'Enzymes'] \ .split(' OR ') elif obrien_id: complex_names = reaction2complexes_info_obrien.loc[obrien_id, 'Enzymes'] \ .split(' OR ') else: continue for e, this_complex_name in enumerate(complex_names): # Start with this: composition = complex2composition(this_complex_name) if not composition: # Skip this one continue cleaned_cplx_name = clean_string(this_complex_name) keff_name = '{}_{}'.format(x.id, this_complex_name) enz_name = '{}_{}'.format(x.id, cleaned_cplx_name) kcat_fwd, kcat_bwd = get_keffs_from_complex_name(keffs, keff_name) if kcat_fwd is None and kcat_bwd is None: continue elif kcat_fwd is None: kcat_fwd = kcat_bwd elif kcat_bwd is None: kcat_bwd = kcat_fwd new_enzyme = Enzyme(enz_name, name='{}_{}: {}'.format(x.id, e, this_complex_name), kcat_fwd=kcat_fwd, kcat_bwd=kcat_bwd, # kcat_fwd=5*kcat_fwd, # kcat_bwd=5*kcat_bwd, kdeg=kdeg_enz, composition=composition) aggregated_coupling_dict[x.id].append(new_enzyme) return aggregated_coupling_dict
[docs]def get_coupling_dict(model, mode, atps_name = None, infer_missing_enz=False): homomer_coupling_dict = get_homomer_coupling_dict(model, mode=mode) aggregated_coupling_dict = get_aggregated_coupling_dict(model, homomer_coupling_dict) # lloyd_dict = get_lloyd_coupling_dict(model) coupling_dict = dict() # Most important last # coupling_dict.update(lloyd_dict) coupling_dict.update(aggregated_coupling_dict) coupling_dict.update(homomer_coupling_dict) if infer_missing_enz: inferred_enz = dict() if isinstance(mode, Number): kcat = mode else: kcat = get_average_kcat() for r in model.reactions: if r.id in coupling_dict: continue if r.id not in coupling_dict \ and is_me_compatible(r): inferred_enz[r.id] = infer_enzyme_from_gpr(r, default_kcat=kcat, default_kdeg=kdeg_enz) coupling_dict.update(inferred_enz) # ATP Synthase bypasses numeric kcat if atps_name is not None: atps = get_atp_synthase_coupling(atps_name) coupling_dict.update(atps) # coupling_dict.update(get_transporters_coupling()) return coupling_dict
[docs]def get_average_kcat(): # return np.median(list(get_lloyd_keffs().values())) return np.mean(list(get_lloyd_keffs().values()))
[docs]def get_atp_synthase_coupling(atps_name): """ ATP synthesis rate of F1F0 ATP synthase Range at room temperature ∼0.060-0.10 μmol/min/mg of membrane protein : at 37°C 0.20 μmol/min/mg of membrane protein Organism Bacteria Escherichia coli Reference Tomashek JJ, Glagoleva OB, Brusilow WS. The Escherichia coli F1F0 ATP synthase displays biphasic synthesis kinetics. J Biol Chem. 2004 Feb 6 279(6):4465-70 DOI: 10.1074/jbc.M310826200 p.4467 right column bottom paragraphPubMed ID14602713 Primary Source [18] Etzold C, Deckers-Hebestreit G, Altendorf K. Turnover number of Escherichia coli F0F1 ATP synthase for ATP synthesis in membrane vesicles. Eur J Biochem. 1997 Jan 15 243(1-2):336-43.PubMed ID9030757 Method Luciferase assay Comments P.4467 right column bottom paragraph: "Previously, Etzold et al. (primary source) used the luciferase assay to measure the turnover number of the ATP synthase during synthesis by membrane vesicles of E. coli. They measured ATP synthesis rates of ∼0.060-0.10 μmol/min/mg of membrane protein at room temperature and 0.20 μmol/min/mg of membrane protein at 37 °C." Entered by Uri M ID 115175 :return: """ # umol/(mg.min) * min/h * mmol/umol * mg/g # kcat = 0.08 * 60 * 1 kcat = 232 * 3600 * 1 composition = { 'b3733':1, 'b3738':1, 'b3732':3, 'b3736':2, 'b3731':1, 'b3735':1, 'b3737':10, 'b3734':3, } atp_synthase = Enzyme(atps_name, name='ATP Synthase', kcat_fwd=kcat, kcat_bwd=kcat, kdeg=kdeg_enz, composition=composition) return {atps_name:[atp_synthase]}
[docs]def get_dna_polymerase(dna_pol_name='DNAPol3'): """ https://en.wikipedia.org/wiki/DNA_polymerase_III_holoenzyme The replisome is composed of the following: 2 DNA Pol III enzymes, each comprising α, ε and θ subunits. (It has been proven that there is a third copy of Pol III at the replisome.[1]) the α subunit (encoded by the dnaE gene) has the polymerase activity. the ε subunit (dnaQ) has 3'→5' exonuclease activity. the θ subunit (holE) stimulates the ε subunit's proofreading. 2 β units (dnaN) which act as sliding DNA clamps, they keep the polymerase bound to the DNA. 2 τ units (dnaX) which act to dimerize two of the core enzymes (α, ε, and θ subunits). 1 γ unit (also dnaX) which acts as a clamp loader for the lagging strand Okazaki fragments, helping the two β subunits to form a unit and bind to DNA. The γ unit is made up of 5 γ subunits which include 3 γ subunits, 1 δ subunit (holA), and 1 δ' subunit (holB). The δ is involved in copying of the lagging strand. Χ (holC) and Ψ (holD) which form a 1:1 complex and bind to γ or τ. X can also mediate the switch from RNA primer to DNA.[2] :return: """ # DNA POLYMERASE III HOLOENZYME: Structure and Function of a Chromosomal Replicating Machine # Annual Review of Biochemistry # Vol. 64:171-200 (Volume publication date July 1995) # https://doi.org/10.1146/annurev.bi.64.070195.001131 ktrans = 1000 * 3600 composition = { # 2 DNA Pol III Enzymes 'b0184':2, #dnaE 'b0215':2, #dnaQ 'b1842':2, #holE # Beta sliding clamp units and Tau scaffold 'b3701': 2*2, # dnaN, Beta clamps 'b0470': 3, # dnaX, 2 tau units and 1 gamma unit 'b0640': 1, # holA, Delta subunit 'b1099': 1, # holB, Delta prime subunit 'b4259': 1, # holC, Xi subunit 'b4372': 1, # holD, Psi subunit } dna_polymerase = Enzyme(dna_pol_name, name='DNA Polymerase 3', kcat_fwd=ktrans, kcat_bwd=ktrans, kdeg=kdeg_enz, composition=composition) return dna_polymerase
[docs]def get_transporters_coupling(model, additional_enz): coupling_dict = get_lloyd_coupling_dict(model, select=additional_enz) # curated_refs = pd.read_csv(pjoin(data_dir,'transporters_kcats_missing.csv'), # header=0, skiprows=[1,], # Units row # index_col=0) # # # UNIPROT has non SI units: # # umol / (min.mgEnz) * g/mol *mol/umol * min/h * mg/g # curated_refs['etfl_kcat'] = \ # curated_refs['kcat'] * curated_refs['weight'] * 1e-6 * 60 * 1000 # # curated_dict = defaultdict(list) # # for rxn, row in curated_refs.iterrows(): # # if row['kcat'] == 0: # continue # # composition = {row['gene'] : 1} # enz_id = '{}_{}'.format(rxn,row['gene']) # enz_name = '{}, {} isoform'.format(rxn,row['gene']) # enz = Enzyme(enz_id, # name=enz_name, # kcat_fwd=row['etfl_kcat'] * 3600, # kcat_bwd=row['etfl_kcat'] * 3600, # kdeg=kdeg_enz, # composition=composition) # curated_dict[rxn].append(enz) # # coupling_dict.update(curated_dict) # print(curated_refs) return coupling_dict
[docs]def get_mrna_dict(model): mrna_dict = dict() # Generate a mRNA dict for x in nt_sequences.index: try: the_gene = model.genes.get_by_id(x) except KeyError: model.genes += [cobra.Gene(id=x)] the_gene = model.genes.get_by_id(x) # Try to get half life from Bernstein et al. try: t_half = bernstein_ecoli_deg_rates.loc[x.upper()]['medium, min.1'] #M9 medium # Mean half life of mrna is 5 minutes in ecoli # tau = t_0.5 /ln(2) # kdeg = 1/tau [min^-1] * [min/h] this_kdeg_mrna = (60 * np.log(2) / t_half) except KeyError: if x in rrna_genes: this_kdeg_mrna = kdeg_rib # Same as ribosome else: this_kdeg_mrna = kdeg_mrna # Average value of 5 mins if np.isnan(this_kdeg_mrna): this_kdeg_mrna = kdeg_mrna # Average value of 5 mins new_mrna = mRNA(x, kdeg = this_kdeg_mrna, gene_id = the_gene.id) mrna_dict[x] = new_mrna return mrna_dict
# Half life of a ribosome is 5 days
[docs]kdeg_rib = np.log(2)/(5*24)
[docs]rrna_genes = ['b3851', 'b3854', 'b3855']
[docs]def get_rib(): """ # Ribosome rRNA: b3851: K01977 16S ribosomal RNA | (RefSeq) rrsA; 16S ribosomal RNA of rrnA operon b3854: K01980 23S ribosomal RNA | (RefSeq) rrlA; 23S ribosomal RNA of rrnA operon b3855: K01985 5S ribosomal RNA | (RefSeq) rrfA; 5S ribosomal RNA of rrnA operon # rPeptides: See file ribosomal_proteins_ecoli.tsv :return: """ #[ ribosomes and RNAP ]# rpeptide_genes = pd.read_csv(pjoin(data_dir,'ribosomal_proteins_ecoli.tsv'), delimiter='\t', header=None)[0] rpeptide_genes = rpeptide_genes.str.split(':').apply(lambda x:x[1]) rib = Ribosome(id='rib', name='Ribosome', kribo=12 * 3600, kdeg=kdeg_rib, composition = rpeptide_genes, rrna=rrna_genes) return rib
# http://bionumbers.hms.harvard.edu/bionumber.aspx?&id=100060&ver=32 # Bionumber ID 100060 # Value 85 nt/sec # Source Bremer, H., Dennis, P. P. (1996) Modulation of chemical composition and other parameters of the cell by growth rate. # Neidhardt, et al. eds. Escherichia coli and Salmonella typhimurium: Cellular and Molecular Biology, 2nd ed. chapter 97 Table 3
[docs]ktrans = 85
[docs]def get_rnap(): """ # RNAP b3295: K03040 DNA-directed RNA polymerase subunit alpha [EC:2.7.7.6] | (RefSeq) rpoA; RNA polymerase, alpha subunit b3649: K03060 DNA-directed RNA polymerase subunit omega [EC:2.7.7.6] | (RefSeq) rpoZ; RNA polymerase, omega subunit b3987: K03043 DNA-directed RNA polymerase subunit beta [EC:2.7.7.6] | (RefSeq) rpoB; RNA polymerase, beta subunit b3988: K03046 DNA-directed RNA polymerase subunit beta' [EC:2.7.7.6] | (RefSeq) rpoC; RNA polymerase, beta prime subunit :return: """ rnap_genes = {'b3295':2,'b3649':1,'b3987':1,'b3988':1} rnap = RNAPolymerase(id='rnap', name='RNA Polymerase', ktrans = ktrans*3600, # kdeg = 0.2, kdeg = kdeg_enz, composition = rnap_genes) return rnap
[docs]def get_sigma_70(rnap): """ # RNAP b3067: rpoD :return: """ sigma_genes = {'b3067':1,} sigma70 = Enzyme(id='sigma70', name='Sigma Factor 70 kDa', kcat = 1, #Not defined kdeg = kdeg_enz, composition = sigma_genes) holo_comp = rnap.composition.copy() holo_comp.update(sigma_genes) holo_rnap = RNAPolymerase(id='holo' + rnap.id, name=str(rnap.name) + ' Holoenzyme', ktrans = rnap.ktrans, kdeg = kdeg_enz, composition = holo_comp) return sigma70, holo_rnap
[docs]def read_growth_dependant_rnap_alloc(): """ Read table with data on Π, the fraction of RNAP holoenzyme. We define Π: Π = holoRNAP / RNAP_total = holoRNAP / (holoRNAP + RNAP_free) :return: """ Pi = pd.read_csv(pjoin(data_dir,'neidhardt_tab3_active_rnap.csv'), header = 1, index_col=0) Pi = Pi.drop('/h', axis=1) return Pi/100
[docs]def get_growth_dependant_transformed_rnap_alloc(): """ We are given the active RNAP ratio Π, which we approximate to be Π = holoRNAP / RNAP_total = holoRNAP / (holoRNAP + RNAP_free) For our calculations, we are interested in q = holoRNAP / RNAP_free Π = holoRNAP / (holoRNAP + RNAP_free) <=> 1/Π = 1 + 1/q <=> 1/Π - 1 = 1/q <=> Π/(1 - Π) = q :return: """ Pi = read_growth_dependant_rnap_alloc() q = Pi / (1-Pi) mus = [float(x) for x in q.columns] return mus, q.to_numpy()[0]