How to Create a Bioinformatics AI Agent Using Biopython for DNA and Protein Analysis

byrn
By byrn
6 Min Read


class BioPythonAIAgent:
   def __init__(self, email="[email protected]"):
       self.email = email
       Entrez.email = email
       self.sequences = {}
       self.analysis_results = {}
       self.alignments = {}
       self.trees = {}
  
   def fetch_sequence_from_ncbi(self, accession_id, db="nucleotide", rettype="fasta"):
       try:
           handle = Entrez.efetch(db=db, id=accession_id, rettype=rettype, retmode="text")
           record = SeqIO.read(handle, "fasta")
           handle.close()
           self.sequences[accession_id] = record
           return record
       except Exception as e:
           print(f"Error fetching sequence: {str(e)}")
           return None
  
   def create_sample_sequences(self):
       covid_spike = "MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT"
      
       human_insulin = "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN"
      
       e_coli_16s = "AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAGCAGCTTGCTGCTTTGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAATGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGACGATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGCGTTAAGGTTAATAACCTTGGCGATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTCTGTCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAACA"
      
       sample_sequences = [
           ("COVID_Spike", covid_spike, "SARS-CoV-2 Spike Protein"),
           ("Human_Insulin", human_insulin, "Human Insulin Precursor"),
           ("E_coli_16S", e_coli_16s, "E. coli 16S rRNA")
       ]
      
       for seq_id, seq_str, desc in sample_sequences:
           record = SeqRecord(Seq(seq_str), id=seq_id, description=desc)
           self.sequences[seq_id] = record
      
       return sample_sequences
  
   def analyze_sequence(self, sequence_id=None, sequence=None):
       if sequence_id and sequence_id in self.sequences:
           seq_record = self.sequences[sequence_id]
           seq = seq_record.seq
           description = seq_record.description
       elif sequence:
           seq = Seq(sequence)
           description = "Custom sequence"
       else:
           return None
      
       analysis = {
           'length': len(seq),
           'composition': {}
       }
      
       for base in ['A', 'T', 'G', 'C']:
           analysis['composition'][base] = seq.count(base)
      
       if 'A' in analysis['composition'] and 'T' in analysis['composition']:
           analysis['gc_content'] = round(gc_fraction(seq) * 100, 2)
           try:
               analysis['molecular_weight'] = round(molecular_weight(seq, seq_type="DNA"), 2)
           except:
               analysis['molecular_weight'] = len(seq) * 650
      
       try:
           if len(seq) % 3 == 0:
               protein = seq.translate()
               analysis['translation'] = str(protein)
               analysis['stop_codons'] = protein.count('*')
              
               if '*' not in str(protein)[:-1]:
                   prot_analysis = ProteinAnalysis(str(protein)[:-1])
                   analysis['protein_mw'] = round(prot_analysis.molecular_weight(), 2)
                   analysis['isoelectric_point'] = round(prot_analysis.isoelectric_point(), 2)
                   analysis['protein_composition'] = prot_analysis.get_amino_acids_percent()
       except:
           pass
      
       key = sequence_id if sequence_id else "custom"
       self.analysis_results[key] = analysis
      
       return analysis
  
   def visualize_composition(self, sequence_id):
       if sequence_id not in self.analysis_results:
           return
      
       analysis = self.analysis_results[sequence_id]
      
       fig = make_subplots(
           rows=2, cols=2,
           specs=[[{"type": "pie"}, {"type": "bar"}],
                  [{"colspan": 2}, None]],
           subplot_titles=("Nucleotide Composition", "Base Count", "Sequence Properties")
       )
      
       labels = list(analysis['composition'].keys())
       values = list(analysis['composition'].values())
      
       fig.add_trace(
           go.Pie(labels=labels, values=values, name="Composition"),
           row=1, col=1
       )
      
       fig.add_trace(
           go.Bar(x=labels, y=values, name="Count", marker_color=['red', 'blue', 'green', 'orange']),
           row=1, col=2
       )
      
       properties = ['Length', 'GC%', 'MW (kDa)']
       prop_values = [
           analysis['length'],
           analysis.get('gc_content', 0),
           analysis.get('molecular_weight', 0) / 1000
       ]
      
       fig.add_trace(
           go.Scatter(x=properties, y=prop_values, mode="markers+lines",
                     marker=dict(size=10, color="purple"), name="Properties"),
           row=2, col=1
       )
      
       fig.update_layout(
           title=f"Comprehensive Analysis: {sequence_id}",
           showlegend=False,
           height=600
       )
      
       fig.show()
  
   def perform_multiple_sequence_alignment(self, sequence_ids):
       if len(sequence_ids) < 2:
           return None
      
       sequences = []
       for seq_id in sequence_ids:
           if seq_id in self.sequences:
               sequences.append(self.sequences[seq_id])
      
       if len(sequences) < 2:
           return None
      
       from Bio.Align import PairwiseAligner
       aligner = PairwiseAligner()
       aligner.match_score = 2
       aligner.mismatch_score = -1
       aligner.open_gap_score = -2
       aligner.extend_gap_score = -0.5
      
       alignments = []
       for i in range(len(sequences)):
           for j in range(i+1, len(sequences)):
               alignment = aligner.align(sequences[i].seq, sequences[j].seq)[0]
               alignments.append(alignment)
      
       return alignments
  
   def create_phylogenetic_tree(self, alignment_key=None, sequences=None):
       if alignment_key and alignment_key in self.alignments:
           alignment = self.alignments[alignment_key]
       elif sequences:
           records = []
           for i, seq in enumerate(sequences):
               record = SeqRecord(Seq(seq), id=f"seq_{i}")
               records.append(record)
           SeqIO.write(records, "temp.fasta", "fasta")
          
           try:
               clustalw_cline = ClustalwCommandline("clustalw2", infile="temp.fasta")
               stdout, stderr = clustalw_cline()
               alignment = AlignIO.read("temp.aln", "clustal")
               os.remove("temp.fasta")
               os.remove("temp.aln")
               os.remove("temp.dnd")
           except:
               return None
       else:
           return None
      
       calculator = DistanceCalculator('identity')
       dm = calculator.get_distance(alignment)
      
       constructor = DistanceTreeConstructor()
       tree = constructor.upgma(dm)
      
       tree_key = f"tree_{len(self.trees)}"
       self.trees[tree_key] = tree
      
       return tree
  
   def visualize_tree(self, tree):
       fig, ax = plt.subplots(figsize=(10, 6))
       Phylo.draw(tree, axes=ax)
       plt.title("Phylogenetic Tree")
       plt.tight_layout()
       plt.show()
  
   def protein_structure_analysis(self, sequence_id):
       if sequence_id not in self.sequences:
           return None
      
       seq = self.sequences[sequence_id].seq
      
       try:
           if len(seq) % 3 == 0:
               protein = seq.translate()
               if '*' not in str(protein)[:-1]:
                   prot_analysis = ProteinAnalysis(str(protein)[:-1])
                  
                   structure_analysis = {
                       'molecular_weight': prot_analysis.molecular_weight(),
                       'isoelectric_point': prot_analysis.isoelectric_point(),
                       'amino_acid_percent': prot_analysis.get_amino_acids_percent(),
                       'secondary_structure': prot_analysis.secondary_structure_fraction(),
                       'flexibility': prot_analysis.flexibility(),
                       'gravy': prot_analysis.gravy()
                   }
                  
                   return structure_analysis
       except:
           pass
      
       return None
  
   def comparative_analysis(self, sequence_ids):
       results = []
      
       for seq_id in sequence_ids:
           if seq_id in self.analysis_results:
               analysis = self.analysis_results[seq_id].copy()
               analysis['sequence_id'] = seq_id
               results.append(analysis)
      
       df = pd.DataFrame(results)
      
       if len(df) > 1:
           fig = make_subplots(
               rows=2, cols=2,
               subplot_titles=("Length Comparison", "GC Content", "Molecular Weight", "Composition Heatmap")
           )
          
           fig.add_trace(
               go.Bar(x=df['sequence_id'], y=df['length'], name="Length"),
               row=1, col=1
           )
          
           if 'gc_content' in df.columns:
               fig.add_trace(
                   go.Scatter(x=df['sequence_id'], y=df['gc_content'], mode="markers+lines", name="GC%"),
                   row=1, col=2
               )
          
           if 'molecular_weight' in df.columns:
               fig.add_trace(
                   go.Bar(x=df['sequence_id'], y=df['molecular_weight'], name="MW"),
                   row=2, col=1
               )
          
           fig.update_layout(title="Comparative Sequence Analysis", height=600)
           fig.show()
      
       return df
  
   def codon_usage_analysis(self, sequence_id):
       if sequence_id not in self.sequences:
           return None
      
       seq = self.sequences[sequence_id].seq
      
       if len(seq) % 3 != 0:
           return None
      
       codons = {}
       for i in range(0, len(seq) - 2, 3):
           codon = str(seq[i:i+3])
           codons[codon] = codons.get(codon, 0) + 1
      
       codon_df = pd.DataFrame(list(codons.items()), columns=['Codon', 'Count'])
       codon_df = codon_df.sort_values('Count', ascending=False)
      
       fig = px.bar(codon_df.head(20), x='Codon', y='Count',
                    title=f"Top 20 Codon Usage - {sequence_id}")
       fig.show()
      
       return codon_df
  
   def motif_search(self, sequence_id, motif_pattern):
       if sequence_id not in self.sequences:
           return []
      
       seq = str(self.sequences[sequence_id].seq)
       positions = []
      
       for i in range(len(seq) - len(motif_pattern) + 1):
           if seq[i:i+len(motif_pattern)] == motif_pattern:
               positions.append(i)
      
       return positions
  
   def gc_content_window(self, sequence_id, window_size=100):
       if sequence_id not in self.sequences:
           return None
      
       seq = self.sequences[sequence_id].seq
       gc_values = []
       positions = []
      
       for i in range(0, len(seq) - window_size + 1, window_size//4):
           window = seq[i:i+window_size]
           gc_values.append(gc_fraction(window) * 100)
           positions.append(i + window_size//2)
      
       fig = go.Figure()
       fig.add_trace(go.Scatter(x=positions, y=gc_values, mode="lines+markers",
                               name=f'GC Content (window={window_size})'))
       fig.update_layout(
           title=f"GC Content Sliding Window Analysis - {sequence_id}",
           xaxis_title="Position",
           yaxis_title="GC Content (%)"
       )
       fig.show()
      
       return positions, gc_values
  
   def run_comprehensive_analysis(self, sequence_ids):
       results = {}
      
       for seq_id in sequence_ids:
           if seq_id in self.sequences:
               analysis = self.analyze_sequence(seq_id)
               self.visualize_composition(seq_id)
              
               gc_analysis = self.gc_content_window(seq_id)
               codon_analysis = self.codon_usage_analysis(seq_id)
              
               results[seq_id] = {
                   'basic_analysis': analysis,
                   'gc_window': gc_analysis,
                   'codon_usage': codon_analysis
               }
      
       if len(sequence_ids) > 1:
           comparative_df = self.comparative_analysis(sequence_ids)
           results['comparative'] = comparative_df
      
       return results



Source link

Share This Article
Leave a Comment

Leave a Reply

Your email address will not be published. Required fields are marked *