Usages
Generating MSA
Generate an MSA by hhblits
hhblits
# Use HHBLITS to generate MSA
tmp_dirt=<temporary directory>
n_threads=4 # number of threads
id=1A2YA # PDB ID and chain of an example
hhblits \
-v 0 \
-d /<directory_to_the_database>/uniclust/UniRef30_2020_03 \
-cpu ${n_threads} \
-n 5 \
-E 0.001 \
-o ${tmp_dirt}/${id}.hhr \
-opsi ${tmp_dirt}/${id}_psiblast.aln \
-i /<directory_to_the_FASTA>/${id}.fasta
Generate an MSA by jackhmmer
jackhmmer
jackhmmer \
--notextw \
--cpu ${n_threads} \
-A ${tmp_dirt}/${id}.sto \
--tblout ${tmp_dirt}/${id}_tbl.out \
--domtblout ${tmp_dirt}/${id}_domtbl.out \
-E 0.01 \
--popen 0.25 \
--pextend 0.4 \
${seq_dirt}/${id}.fasta \
/<directory_to_the_database>/uniref100.fasta \
# Convert format
amo sequence_converter \
-jobname ${id} \
-type 1 \
-msa ${tmp_dirt}/${id}.sto \
-threads ${n_threads} \
-output ${out_dirt}
Usage of AMO
amo: usage command line
amo <command> <args>
These are common Ler commands used in various situations:
get amo and system info. (see also: amo help tutorial)
about -about amo
usage -usage of amo
work on the omics (see also: amo help metaomics)
sff2fastq -convert an SFF file from the 454 genome sequencer
to FASTQ including the sequences and quality scores
base_call -Base calling
atgc_stats -Statistics of FASTQ sequence
atgc_quality -Quality control
atgc_correct -Correct errors in sequencing data
atgc_assembly -sequence assembly by de Bruijn graph or ant colony optimization
read_filter -Filter reads with low quality
kmer_counter -count kmer
kmer_generator -generate kmers
work on the proteomics (see also: amo help proteomics)
sequence_converter -convert the ouput of Jackhmmer to fasta
sequence_mix -pairwise-mixture protein sequences (particular for GPCR-G proteins)
sequence_trim -trim a multiple protein sequence alignment
sequence_stats -statistical analysis on a protein mutiple sequence alignment
sequence_coupling -coupled relationship betwen pairwise protein residues and functional sectors
sequence_eca -infer coupled blocks of protein residues at conserved sites
sequence_energy -calculate energy for a protein sequence or multiple sequences
sequence_design -design a protein sequence
sequence_ddg -infer protein stability (ddG = dG_wt - dG_mutant)
point_mutation -point mutation for a given protein sequence
residue_contact -estimate contacts between pairwise residues from protein MSA
selectivity_sector -detect selectivity sectors of determinants in GPCR-G proteins
work on protein folding
pdb_parser -parser a PDB file and write it out in simplified PDB format and/or FASTA sequence file
fasta2pdb -convert protien amino acid sequence into its extended structure
fold_protein -predict tertiary structure
protein_folding -predict folding pathways & tertiary structure
calc_contact -calculate residue contacts from a give PDB file
work on statistics (see also: amo help statistics)
stats_ica -independent component analysis on a given matrix
stats_pearson -Pearson correlation
stats_tsne -t-distributed stochastic neighbor embedding (t-SNE) method
work on optimization tools (see also: amo help optimization)
opt_chpso -the convergent heterogeneous particle swarm optimizer
opt_jde -the differential evolution optimizer
work on sampling (see also: amo help sampling)
nested_sampler -nested sampler
work on machine learning (see also: amo help learning)
senet -sequentially evolving neural network
optifel -evolving fuzzy modeling approach
phsior -protein torsion angle predictor based on CNN deep network
Example pipeline
#!/bin/bash
jackhmmer=/usr/local/bin/jackhmmer
usrname=`whoami`
home_dirt=/home/${usrname}
dirt=${home_dirt}/amo-ai
#lib_dirt=${dirt}/amo-lib
amo=${dirt}/amo-build/tool/amo
data_idx=90
database=${home_dirt}/.data/uniref${data_idx}.fasta
out_dirt=${dirt}/amo-output
#bin_dir=${dirt}/amo-bin
#srp_dir=${dirt}/amo-scripts
exp_dirt=${dirt}/amo-example
seq_dirt=${exp_dirt}/fasta
if [ $# -ne 1 ]; then
echo "Usage: $0 <protein name>"
echo "where the <protein name> is the same as that of it FASTA."
exit
fi
protein=$1
job_name=${protein}
n_thread=2
start=$(date +%s.%N)
if [ ! -d ${out_dirt}/${job_name} ]
then
mkdir -p ${out_dirt}/${job_name}
fi
fasta=${seq_dirt}/${protein}.fasta
fasta_wt=${out_dirt}/${job_name}/${protein}_WT.fasta
# date +"%T-%m-%d-%y"
printf "## AMO-AI is working on protein %s \n" "${protein}"
# Pre-process
printf "## L1. Obtaining FASTA of %s ...\n" "${protein}"
n=`cat ${fasta} | wc | awk '{print $1}'`
echo "`head -1 ${fasta}`" > ${fasta_wt}
for i in `seq 2 ${n}`
do
cat ${fasta} | head -${i} | tail -1 | while read line; do printf "%s" "$line"; done >> ${fasta_wt}
done
printf "## Protein name: %s\n" "${protein}"
printf "## Residue number: %s\n" "`tail -1 ${fasta_wt} | wc -c`"
# echo "## `tail -1 ${fasta_wt}`"
printf "## L2. Searhing protein (%s) against UniRef%s database ...\n" "${protein}" "${data_idx}"
flag_jackhmmer=0
if [ ! -f ${out_dirt}/${job_name}/${protein}.sto ]; then
flag_jackhmmer=1
elif [[ -f ${out_dirt}/${job_name}/${protein}.sto && `ls -s ${out_dirt}/${job_name}/${protein}.sto | awk '{print $1}'` -eq 0 ]]; then
flag_jackhmmer=1
fi
if [ ${flag_jackhmmer} -eq 1 ]; then
start=`date +%s`
$jackhmmer --notextw \
--cpu ${n_thread} \
-A ${out_dirt}/${job_name}/${protein}.sto \
--tblout ${out_dirt}/${job_name}/${protein}_tbl.out \
--domtblout ${out_dirt}/${job_name}/${protein}_domtbl.out \
-E 0.01 \
--popen 0.25 \
--pextend 0.4 \
${fasta_wt} \
${database} \
>/dev/null
#--mxfile ${lib_dirt}/common/BLOSUM62.mat \
# -T 0.4 \ bit scorce
end=`date +%s`
run_time=$((end-start))
printf "## Running time: %f" ${run_time}
fi
printf "## L3. Converting from the file (*.sto) of %s generated by jackhmmer to *.a2m ...\n" "${protein}"
${amo} sequence_converter \
-jobname ${job_name} \
-msa ${out_dirt}/${job_name}/${protein}.sto \
-threads ${n_thread} \
-output ${out_dirt}/
printf "## L4. Trimming the obtained MSA of %s ...\n" "${protein}"
${amo} sequence_trim \
-jobname ${job_name} \
-msa ${out_dirt}/${job_name}/${job_name}_msa.a2m \
-gap_frac 0.15 \
-threads ${n_thread} \
-output ${out_dirt}/
printf "## L4. Compute statistics on the aligned sequences of protein %s ...\n" "${protein}"
${amo} sequence_stats \
-jobname ${job_name} \
-msa ${out_dirt}/${job_name}/${job_name}_msa_trimmed.aln \
-threads ${n_thread} \
-max_iter 1000 \
-output ${out_dirt}/
printf "## L5. Compute coupling on the aligned sequences of protein %s ...\n" "${protein}"
${amo} sequence_coupling \
-jobname ${job_name} \
-msa ${out_dirt}/${job_name}/${job_name}_msa.a2m \
-threads ${n_thread} \
-max_iter 10 \
-output ${out_dirt}/
printf "## L6. Point_mutation for protein %s\n" "${protein}"
${amo} point_mutation \
-jobname ${job_name} \
-threads ${n_thread} \
-fastx ${fasta_wt} \
-mat ${out_dirt}/${job_name}/${job_name}_sequence_potential.csv \
-output ${out_dirt}/
printf "## L7. Design sequence for protein %s\n" "${protein}"
${amo} sequence_design \
-jobname ${job_name} \
-threads ${n_thread} \
-fastx ${fasta_wt} \
-mat ${out_dirt}/${job_name}/${job_name}_sequence_potential.csv \
-max_iter 100000 \
-output ${out_dirt}/
printf "## L8. Compute sequence energy for protein %s\n" "${protein}"
${amo} sequence_energy \
-jobname ${job_name} \
-threads ${n_thread} \
-msa ${out_dirt}/${job_name}/${job_name}_msa_trimmed.aln \
-mat ${out_dirt}/${job_name}/${job_name}_sequence_potential.csv \
-output ${out_dirt}/
end=$(date +%s.%N)
runtime=$(python -c "print(${end} - ${start})")
printf "## L9. Completed in %f\n\n" "${runtime}"
#date +"%T-%m-%d-%y"
Last updated