import pandas as pd
import re
# Header lines that delimit the sections of a KEGG gene page pasted as text.
_KEGG_SECTION_HEADERS = frozenset({
    "Entry", "Symbol", "Name", "KO", "Organism", "Pathway", "Network",
    "Element", "Disease", "Drug target", "Brite", "Motif", "Other DBs",
    "Position",
})

# Entry patterns for the three sections that are turned into features.
_PATHWAY_ID_RE = re.compile(r"hsa\d+\b")        # e.g. "hsa04110 Cell cycle"
_DISEASE_ENTRY_RE = re.compile(r"H\d{5}\b")     # e.g. "H00031 Breast cancer"
_DRUG_ID_RE = re.compile(r"\bD\d{5}\b")         # e.g. "Cenersen sodium: D08887"


def _kegg_section_lines(content, header):
    """Return the non-empty, stripped lines of one section, or None if absent.

    A section starts after the first line that is exactly ``header`` and ends
    at the next line that is itself a known section header (or at the end of
    the text). Matching whole lines, rather than substrings, keeps entry text
    such as "Pathways in cancer" or "Human Diseases" from being mistaken for
    a section boundary.
    """
    section = None
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if section is None:
            if line == header:
                section = []
        elif line in _KEGG_SECTION_HEADERS:
            break
        elif line:
            section.append(line)
    return section


def extract_tp53_features(content):
    """Extracts features from TP53 KEGG record content (string).

    Args:
        content: Text of a KEGG gene page in which every section header
            ("Pathway", "Disease", "Drug target", ...) is on its own line.

    Returns:
        dict mapping feature name to int. Keys are only present for the
        sections found in ``content``:
          * 'TP53_in_Cell_Cycle'   - 1 if pathway hsa04110 is listed, else 0
          * 'TP53_in_Cancer'       - 1 if any disease name contains "cancer"
          * 'TP53_is_Drug_Target'  - 1 if at least one drug is listed
          * 'TP53_Num_Drug_Target' - number of listed drug entries
    """
    features = {}

    # --- Example: Extract pathway information ---
    pathway_lines = _kegg_section_lines(content, "Pathway")
    if pathway_lines is not None:
        # Only the ID at the start of each entry line counts as a pathway.
        id_matches = (_PATHWAY_ID_RE.match(line) for line in pathway_lines)
        pathways = {m.group() for m in id_matches if m}
        features['TP53_in_Cell_Cycle'] = int("hsa04110" in pathways)
        # Add more pathway features here...

    # --- Example: Extract disease information ---
    disease_lines = _kegg_section_lines(content, "Disease")
    if disease_lines is not None:
        diseases = [d for d in disease_lines if _DISEASE_ENTRY_RE.match(d)]
        features['TP53_in_Cancer'] = int(
            any("cancer" in d.lower() for d in diseases)
        )
        # Add more disease features here...

    # --- Extract drug target information ---
    drug_lines = _kegg_section_lines(content, "Drug target")
    if drug_lines is not None:
        drugs = [d for d in drug_lines if _DRUG_ID_RE.search(d)]
        features['TP53_is_Drug_Target'] = 1 if drugs else 0
        features['TP53_Num_Drug_Target'] = len(drugs)

    return features
# Sample data: plain-text copy of the KEGG gene page for human TP53 (entry 7157).
# Paste the content of your own file directly here to run the extraction on it.
# Each section header ("Pathway", "Disease", "Drug target", "Brite", ...) sits on
# its own line, followed by that section's entries. The text is passed verbatim
# to extract_tp53_features() further down.
# NOTE(review): the Brite entry "09152 Endocrine system... 04919 ..." looks like
# lines that were elided or fused during the paste - confirm against the source page.
data = """
Homo sapiens (human): 7157 Help
Entry
7157 CDS T01001
Symbol
TP53, BCC7, BMFS5, LFS1, P53, TRP53
Name
(RefSeq) tumor protein p53
KO
K04451 tumor protein p53
Organism
hsa Homo sapiens (human)
Pathway
hsa01522 Endocrine resistance
hsa01524 Platinum drug resistance
hsa04010 MAPK signaling pathway
hsa04071 Sphingolipid signaling pathway
hsa04110 Cell cycle
hsa04115 p53 signaling pathway
hsa04137 Mitophagy - animal
hsa04151 PI3K-Akt signaling pathway
hsa04210 Apoptosis
hsa04211 Longevity regulating pathway
hsa04216 Ferroptosis
hsa04218 Cellular senescence
hsa04310 Wnt signaling pathway
hsa04722 Neurotrophin signaling pathway
hsa04919 Thyroid hormone signaling pathway
hsa05012 Parkinson disease
hsa05014 Amyotrophic lateral sclerosis
hsa05016 Huntington disease
hsa05131 Shigellosis
hsa05160 Hepatitis C
hsa05161 Hepatitis B
hsa05162 Measles
hsa05163 Human cytomegalovirus infection
hsa05165 Human papillomavirus infection
hsa05166 Human T-cell leukemia virus 1 infection
hsa05167 Kaposi sarcoma-associated herpesvirus infection
hsa05168 Herpes simplex virus 1 infection
hsa05169 Epstein-Barr virus infection
hsa05200 Pathways in cancer
hsa05202 Transcriptional misregulation in cancer
hsa05203 Viral carcinogenesis
hsa05205 Proteoglycans in cancer
hsa05206 MicroRNAs in cancer
hsa05210 Colorectal cancer
hsa05212 Pancreatic cancer
hsa05213 Endometrial cancer
hsa05214 Glioma
hsa05215 Prostate cancer
hsa05216 Thyroid cancer
hsa05217 Basal cell carcinoma
hsa05218 Melanoma
hsa05219 Bladder cancer
hsa05220 Chronic myeloid leukemia
hsa05222 Small cell lung cancer
hsa05223 Non-small cell lung cancer
hsa05224 Breast cancer
hsa05225 Hepatocellular carcinoma
hsa05226 Gastric cancer
hsa05230 Central carbon metabolism in cancer
hsa05417 Lipid and atherosclerosis
hsa05418 Fluid shear stress and atherosclerosis
Network
nt06160 Human T-cell leukemia virus 1 (HTLV-1)
nt06162 Hepatitis B virus (HBV)
nt06163 Hepatitis C virus (HCV)
nt06164 Kaposi sarcoma-associated herpesvirus (KSHV)
nt06165 Epstein-Barr virus (EBV)
nt06166 Human papillomavirus (HPV)
nt06167 Human cytomegalovirus (HCMV)
nt06168 Herpes simplex virus 1 (HSV-1)
nt06169 Measles virus (MV)
nt06170 Influenza A virus (IAV)
nt06230 Cell cycle (cancer)
nt06240 Transcription (cancer)
nt06260 Colorectal cancer
nt06261 Gastric cancer
nt06262 Pancreatic cancer
nt06263 Hepatocellular carcinoma
nt06265 Bladder cancer
nt06266 Non-small cell lung cancer
nt06267 Small cell lung cancer
nt06268 Melanoma
nt06269 Basal cell carcinoma
nt06270 Breast cancer
nt06271 Endometrial cancer
nt06273 Glioma
nt06274 Thyroid cancer
nt06276 Chronic myeloid leukemia
nt06461 Huntington disease
nt06463 Parkinson disease
Element
N00066 MDM2-p21-Cell cycle G1/S
N00067 Deleted p14(ARF) to p21-cell cycle G1/S
N00068 Amplified MDM2 to p21-cell cycle G1/S
N00076 Mutation-inactivated p14(ARF) to p21-cell cycle G1/S
N00115 Mutation-inactivated TP53 to transcription
N00131 Amplified MYCN to transcriptional activation
N00167 KSHV vIRF1/3 to p21-cell cycle G1/S
N00169 KSHV LANA to p21-cell cycle G1/S
N00223 EBV EBNA1 to p53-mediated transcription
N00263 EBV EBNA3C to p53-mediated transcription
N00347 p300-p21-Cell cycle G1/S
N00358 HPV E6 to p21-cell cycle G1/S
N00420 HCMV IE2-86 to p21-cell cycle G1/S
N00481 EBV BZLF1 to p53-mediated transcription
N00497 HTLV-1 Tax to p21-cell cycle G1/S
N00499 ATR-p21-Cell cycle G2/M
N00520 HCV NS5A to p21-cell cycle G1/S
N00521 HCV Core to p21-cell cycle G1/S
N00522 HCV NS3 to p21-cell cycle G1/S
N00535 HBV HBx to p53-mediated transcription
N00536 MDM2-p21-Cell cycle G1/S
N00592 HSV ICP0 to p53-mediated transcription
N00697 HV P to p53-mediated transcription
N00982 Mutation-caused aberrant Htt to p53-mediated transcription
N01058 Mutation-inactivated DJ1 to to p53-mediated transcription
Disease
H00004 Chronic myeloid leukemia
H00005 Chronic lymphocytic leukemia
H00006 Hairy cell leukemia
H00008 Burkitt lymphoma
H00009 Adult T-cell leukemia
H00010 Multiple myeloma
H00013 Small cell lung cancer
H00014 Non-small cell lung cancer
H00015 Malignant pleural mesothelioma
H00016 Oral cancer
H00017 Esophageal cancer
H00018 Gastric cancer
H00019 Pancreatic cancer
H00020 Colorectal cancer
H00022 Bladder cancer
H00025 Penile cancer
H00026 Endometrial cancer
H00027 Ovarian cancer
H00028 Choriocarcinoma
H00029 Vulvar cancer
H00031 Breast cancer
H00032 Thyroid cancer
H00033 Adrenal carcinoma
H00036 Osteosarcoma
H00038 Melanoma
H00039 Basal cell carcinoma
H00040 Squamous cell carcinoma
H00041 Kaposi sarcoma
H00042 Glioma
H00044 Cancer of the anal canal
H00046 Cholangiocarcinoma
H00047 Gallbladder cancer
H00048 Hepatocellular carcinoma
H00055 Laryngeal cancer
H00881 Li-Fraumeni syndrome
H01007 Choroid plexus papilloma
H01463 Mycosis fungoides
H01464 Mantle cell lymphoma
H01470 Giant cell tumor of bone
H01554 Fallopian tube cancer
H01555 Merkel cell carcinoma
H01557 Hepatic angiosarcoma
H01559 Oropharyngeal cancer
H01667 Medulloblastoma
H02301 Nephroblastoma
H02411 Chronic myelomonocytic leukemia
H02434 Diffuse large B-cell lymphoma, not otherwise specified
H02529 Bone marrow failure syndrome
Drug target
Cenersen sodium: D08887
Rezatapopt: D12982
Brite
KEGG Orthology (KO) [BR:hsa00001]
09130 Environmental Information Processing
09132 Signal transduction
04310 Wnt signaling pathway
7157 (TP53)
04071 Sphingolipid signaling pathway
7157 (TP53)
04151 PI3K-Akt signaling pathway
7157 (TP53)
09140 Cellular Processes
09141 Transport and catabolism
04137 Mitophagy - animal
7157 (TP53)
09150 Organismal Systems
09152 Endocrine system... 04919 Thyroid hormone signaling pathway
7157 (TP53)
09156 Nervous system
04722 Neurotrophin signaling pathway
7157 (TP53)
09149 Aging
04211 Longevity regulating pathway
7157 (TP53)
09160 Human Diseases
09161 Cancer: overview
05200 Pathways in cancer
7157 (TP53)
05202 Transcriptional misregulation in cancer
7157 (TP53)
05206 MicroRNAs in cancer
7157 (TP53)
05205 Proteoglycans in cancer
7157 (TP53)
05203 Viral carcinogenesis
7157 (TP53)
05230 Central carbon metabolism in cancer
7157 (TP53)
09162 Cancer: specific types
05210 Colorectal cancer
7157 (TP53)
05212 Pancreatic cancer
7157 (TP53)
05225 Hepatocellular carcinoma
7157 (TP53)
05226 Gastric cancer
7157 (TP53)
05214 Glioma
7157 (TP53)
05216 Thyroid cancer
7157 (TP53)
05220 Chronic myeloid leukemia
7157 (TP53)
05217 Basal cell carcinoma
7157 (TP53)
05218 Melanoma
7157 (TP53)
05219 Bladder cancer
7157 (TP53)
05215 Prostate cancer
7157 (TP53)
05213 Endometrial cancer
7157 (TP53)
05224 Breast cancer
7157 (TP53)
05222 Small cell lung cancer
7157 (TP53)
05223 Non-small cell lung cancer
7157 (TP53)
09172 Infectious disease: viral
05166 Human T-cell leukemia virus 1 infection
7157 (TP53)
05161 Hepatitis B
7157 (TP53)
05160 Hepatitis C
7157 (TP53)
05162 Measles
7157 (TP53)
05163 Human cytomegalovirus infection
7157 (TP53)
05167 Kaposi sarcoma-associated herpesvirus infection
7157 (TP53)
05169 Epstein-Barr virus infection
7157 (TP53)
05165 Human papillomavirus infection
7157 (TP53)
09171 Infectious disease: bacterial
05131 Shigellosis
7157 (TP53)
09164 Neurodegenerative disease
05012 Parkinson disease
7157 (TP53)
05014 Amyotrophic lateral sclerosis
7157 (TP53)
05016 Huntington disease
7157 (TP53)
09166 Cardiovascular disease
05417 Lipid and atherosclerosis
7157 (TP53)
05418 Fluid shear stress and atherosclerosis
7157 (TP53)
09176 Drug resistance: antineoplastic
01524 Platinum drug resistance
7157 (TP53)
01522 Endocrine resistance
7157 (TP53)
09180 Brite Hierarchies
09182 Protein families: genetic information processing
03000 Transcription factors [BR:hsa03000]
7157 (TP53)
03036 Chromosome and associated proteins [BR:hsa03036]
7157 (TP53)
03400 DNA repair and recombination proteins [BR:hsa03400]
7157 (TP53)
Transcription factors [BR:hsa03000]
Eukaryotic type
beta-Scaffold factors with minor groove contacts
p53
7157 (TP53)
Chromosome and associated proteins [BR:hsa03036]
Eukaryotic type
Sister chromatid separation proteins
Aurora kinases
Regulators of Aurora kinases
7157 (TP53)
DNA repair and recombination proteins [BR:hsa03400]
Eukaryotic type
Check point factors
Other check point factors
7157 (TP53)
BRITE hierarchy
SSDB OrthologParalogGene clusterGFIT
Motif
Pfam: P53 TAD2 P53_tetramer P53_TAD
Motif
Other DBs
NCBI-GeneID: 7157
NCBI-ProteinID: NP_000537
OMIM: 191170
HGNC: 11998
Ensembl: ENSG00000141510
UniProt: P04637 K7PPA8 Q53GA5
Structure PDBPDBj
Position
17:complement(7668421..7687490)
Genome browser
AA seq 393 aa AA seqDB search
MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP
DEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAK
SVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHE
RCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNS
SCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELP
PGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPG
GSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD
NT seq 1182 nt NT seq
atggaggagccgcagtcagatcctagcgtcgagccccctctgagtcaggaaacattttca
gacctatggaaactacttcctgaaaacaacgttctgtcccccttgccgtcccaagcaatg
gatgatttgatgctgtccccggacgatattgaacaatggttcactgaagacccaggtcca
gatgaagctcccagaatgccagaggctgctccccccgtggcccctgcaccagcagctcct
acaccggcggcccctgcaccagccccctcctggcccctgtcatcttctgtcccttcccag
aaaacctaccagggcagctacggtttccgtctgggcttcttgcattctgggacagccaag
tctgtgacttgcacgtactcccctgccctcaacaagatgttttgccaactggccaagacc
tgccctgtgcagctgtgggttgattccacacccccgcccggcacccgcgtccgcgccatg
gccatctacaagcagtcacagcacatgacggaggttgtgaggcgctgcccccaccatgag
cgctgctcagatagcgatggtctggcccctcctcagcatcttatccgagtggaaggaaat
ttgcgtgtggagtatttggatgacagaaacacttttcgacatagtgtggtggtgccctat
gagccgcctgaggttggctctgactgtaccaccatccactacaactacatgtgtaacagt
tcctgcatgggcggcatgaaccggaggcccatcctcaccatcatcacactggaagactcc
agtggtaatctactgggacggaacagctttgaggtgcgtgtttgtgcctgtcctgggaga
gaccggcgcacagaggaagagaatctccgcaagaaaggggagcctcaccacgagctgccc
ccagggagcactaagcgagcactgcccaacaacaccagctcctctccccagccaaagaag
aaaccactggatggagaatatttcacccttcagatccgtgggcgtgagcgcttcgagatg
ttccgagagctgaatgaggccttggaactcaaggatgcccaggctgggaaggagccaggg
gggagcagggctcactccagccacctgaagtccaaaaagggtcagtctacctcccgccat
aaaaaactcatgttcaagacagaagggcctgactcagactga
"""
# Extract features from the data
tp53_features = extract_tp53_features(data)
# Create a Pandas Series named after the gene. The name becomes the row label
# when the Series is turned into a one-row frame below.
tp53_series = pd.Series(tp53_features, name='TP53')
# Sample combined dataset (replace with your actual data)
combined_data = {'Gene': ['TP53'],
                 'Other_Feature_1': [0.5],
                 'Other_Feature_2': [1.2]}
combined_df = pd.DataFrame(combined_data)
combined_df = combined_df.set_index('Gene')  # Set 'Gene' as index
# Add extracted features to the combined dataset using .join.
# DataFrame.join aligns rows on index labels, so the feature row must be
# labelled 'TP53' like combined_df's index. An unnamed Series would yield a row
# labelled 0, match nothing, and leave every joined feature column as NaN.
combined_df = combined_df.join(tp53_series.to_frame().T)
print(combined_df)
# Now combined_df is ready for further processing in machine learning
# TP53 PATHWAY CODE © 2025 by Mrinmoy Chakraborty is licensed under CC BY-NC-ND 4.0
# No comments:
# Post a Comment