Sequence features
Given a target protein sequence, leri extracts three types of sequence-based features involving position-specific scoring matrices (PSSM), secondary structure (SS), and solvent accessibility (SA). The PSSM is generated by PSIBLAST \cite{altschul1997gapped} search of the query against a non-redundant sequence database with 20 log-odds scores taken at each position. The secondary structure (SS) is predicted by PSI-PRED \cite{jones1999protein}, with the three states defined as alpha-helix, beta-strand, and coil. The solvent accessibility (SA) is predicted by the neural networks \cite{chen2005prediction}.
#!/bin/bash
dirt=${PWD}
#########################################################################
psiblast=${dirt}/lib/ncbi-blast-2.11.0+-src/c++/ReleaseMT/bin/psiblast
stride=/usr/bin/stride
NR=${dirt}/lib/database/nr.filter
database=~/.data/uniclust/UniRef30_2020_03
rama_basin=~/lesria/leri-projects/rama_map/leri-output/leria_rama_map/rama_5x5
leri=~/lesria/leri-build/tool/leri
n_threads=2
n_threads=1
postfix=${1}
lst=${dirt}/list_${postfix}
#########################################################################
dirt_data=${dirt}/data_${postfix}
dirt_fasta=${dirt_data}/fasta
dirt_msa=${dirt_data}/msa
dirt_pssm=${dirt_data}/pssm
dirt_ss=${dirt_data}/ss
dirt_stride=${dirt_data}/stride
dirt_re=${dirt_data}/re
dirt_dc=${dirt_data}/dc
dirt_pssp=${dirt_data}/pssp
dirt_ec=${dirt_data}/ec
dirt_rbp=${dirt_data}/rbp
dirt_tmp=${dirt_data}/tmp
mkdir -p ${dirt_fasta}
mkdir -p ${dirt_msa}
mkdir -p ${dirt_pssm}
mkdir -p ${dirt_ss}
mkdir -p ${dirt_stride}
mkdir -p ${dirt_re}
mkdir -p ${dirt_dc}
mkdir -p ${dirt_pssp}
mkdir -p ${dirt_ec}
mkdir -p ${dirt_rbp}
mkdir -p ${dirt_tmp}
# AA, PP, PSSM
# RE, DC, PSSP, RMP
n=`cat ${lst} | wc -l`
k=0
for i in `seq 1 ${n}`
# for i in `seq 1 1`
do
k=`expr ${k} + 1`
id=`head -${i} ${lst} | tail -1`
ch=${id:4:1}
id=${id:0:4}
echo "$id $ch $k/$n"
wget -q -nv https://files.rcsb.org/download/${id}.pdb -O ${dirt_data}/${id}.pdb
# FASTA
stride -q -c${ch} ${dirt_data}/${id}.pdb > ${dirt_fasta}/${id}${ch}.fasta
num=`head -1 ${dirt_fasta}/${id}${ch}.fasta | awk '{print $3}'`
# PHI
stride \
${dirt_data}/${id}.pdb \
-r${ch} \
| grep "ASG" | awk '{print $8}' \
> ${dirt_stride}/${id}${ch}.phi
# PSI
stride \
${dirt_data}/${id}.pdb \
-r${ch} \
| grep "ASG" | awk '{print $9}' \
> ${dirt_stride}/${id}${ch}.psi
# SS : real
stride \
${dirt_data}/${id}.pdb \
-r${ch} \
| grep "ASG" | grep " ${ch} " | awk '{printf("%s", $6)}' \
> ${dirt_ss}/${id}${ch}.ss
# PSSM
${psiblast} \
-db ${NR} \
-num_iterations 3 \
-num_threads 4 \
-out_ascii_pssm ${dirt_pssm}/${id}${ch}.pssm \
-query ${dirt_fasta}/${id}${ch}.fasta \
> /dev/null
# RE, DC, PSSP
flag_msa=0
# if [ ! -e "${dirt_tmp}/${id}${ch}_psiblast.a2m" ]
if [ ! -e "${dirt_msa}/${id}${ch}_msa.a2m" ]
then
flag_msa=1
fi
# if [[ ( -e "${dirt_tmp}/${id}${ch}_psiblast.a2m" ) && ( `wc -c ${dirt_tmp}/${id}${ch}_psiblast.a2m | awk '{print $1}'` -eq 0 ) ]]
if [[ ( -e "${dirt_msa}/${id}${ch}_msa.a2m" ) && ( `wc -c ${dirt_msa}/${id}${ch}_msa.a2m | awk '{print $1}'` -eq 0 ) ]]
then
flag_msa=1
fi
echo $flag_msa
if [ ${flag_msa} -eq 1 ]
then
hhblits \
-v 0 \
-d ${database} \
-cpu ${n_threads} \
-n 5 \
-E 0.001 \
-o ${dirt_tmp}/${id}${ch}.hhr \
-opsi ${dirt_tmp}/${id}${ch}_psiblast.aln \
-i ${dirt_fasta}/${id}${ch}.fasta \
> /dev/null
>${dirt_tmp}/${id}${ch}_psiblast.a2m
# cat ${seq_dirt}/${id}_WT.fasta >> ${tmp_dirt}/${id}_psiblast.a2m
# sed -i '1d' ${tmp_dirt}/${id}_psiblast.aln
nseq=`cat ${dirt_tmp}/${id}${ch}_psiblast.aln | wc -l`
echo $nseq
for j in `seq 1 ${nseq}`
do
name=`head -${j} ${dirt_tmp}/${id}${ch}_psiblast.aln | tail -1 | awk '{print $1}'`
seq=`head -${j} ${dirt_tmp}/${id}${ch}_psiblast.aln | tail -1 | awk '{print $2}'`
echo ">${name}" >> ${dirt_tmp}/${id}${ch}_psiblast.a2m
echo "${seq}" >> ${dirt_tmp}/${id}${ch}_psiblast.a2m
done
cat ${dirt_tmp}/${id}${ch}_psiblast.a2m >> ${dirt_msa}/${id}${ch}_msa.a2m
fi
${leri} sequence_stats \
-jobname ${id}${ch} \
-msa ${dirt_msa}/${id}${ch}_msa.a2m \
-threads ${n_threads} \
-output ${dirt_tmp}
${leri} sequence_eca \
-jobname ${id}${ch} \
-msa ${dirt_msa}/${id}${ch}_msa.a2m \
-threads ${n_threads} \
-output ${dirt_tmp}
# RE
mv ${dirt_tmp}/${id}${ch}/${id}${ch}_relative_entropy.csv ${dirt_re}/${id}${ch}_relative_entropy.csv
# DC
mv ${dirt_tmp}/${id}${ch}/${id}${ch}_degree_of_conservation.csv ${dirt_dc}/${id}${ch}_degree_of_conservation.csv
# PSSP
n_pssp=`expr ${num} + ${num}`
n_pssp=`expr ${n_pssp} + 3`
head -${n_pssp} ${dirt_tmp}/${id}${ch}/${id}${ch}_sequence_potential.csv > ${dirt_pssp}/${id}${ch}_pssp.csv
# EC
mv ${dirt_tmp}/${id}${ch}/${id}${ch}_amino_acid_coupling.csv ${dirt_ec}/${id}${ch}_amino_acid_coupling.csv
# RBP
for j in `seq 1 ${num}`
do
printf 'A'
done > ${dirt_tmp}/${id}${ch}/${id}${ch}_ALL.sec
${leri} rama_map \
-jobname ${id}${ch} \
-fastx ${dirt_fasta}/${id}${ch}.fasta \
-type 1 \
-dirt ${rama_basin} \
-index ${dirt_tmp}/${id}${ch}/${id}${ch}_ALL.sec \
-output ${dirt_tmp}
mkdir -p ${dirt_rbp}/${id}${ch}
mv ${dirt_tmp}/${id}${ch}/${id}${ch}_rama_map*.csv ${dirt_rbp}/${id}${ch}
rm -rf ${dirt_tmp}/${id}${ch}
rm -rf ${dirt_tmp}/${id}${ch}.hhr
rm -rf ${dirt_data}/${id}.pdb
rm -rf ${dirt_tmp}/${id}${ch}_psiblast*
done
Last updated