使用例
#@ faidx
data=genome.fa
cat $data |
awk 'BEGIN {
OFMT="%.10g"
OFS="\t"
getline; if($1!~/^>/) exit;
offset0 = (length($0)+1)
offset = offset0
sub(/^>/,"",$1); name=$1
}
{
offset += (length($0)+1)
if(/^>/) {
print name, seqlength, offset0, linebases, linebases + 1
sub(/^>/,"",$1); name = $1
offset0 = offset
seqlength = 0
linebases = 0
} else {
if(linebases==0) linebases = length($0)
seqlength += length($0)
}
}
END {print name, seqlength, offset0, linebases, linebases+1}'
#@ g2s
awk '$3=="transcript"' |
cut -f9 |
cut -d';' -f1,4 |
tr -d ';"' |
awk '$1!~/_PAR_Y$/' | # You can reduce this step
awk '{sub(/\..*/,"",$2) ;
print $2,$4}' |
sort -u
#@ t2g
awk '$3=="transcript"' |
cut -f9 |
cut -d';' -f1,2 |
cut -d' ' -f2,4 |
tr -d ';"' |
awk '$1!~/_PAR_Y$/' | # You can remove this step
awk '{sub(/\..*/,"",$1) ;
sub(/\..*/,"",$2) ;
print $2,$1}' |
sort -u
#@ col_9
data=data.gtf
#
cat $data |
awk -F'\t' '$3=="transcript"{print $9}' |
awk '{n=split($0, a,";"); for(i=1;i<=n-1;i++) print NR, i, a[i]}'
#
cat $data |
awk -F'\t' '$3=="transcript"{
split($9, a,";")
sub(/ [^ ]*$/,"",a[1])
if(!u[a[1]]++) print a[1] }'
#@ GTF_GFF_1_8
tmp=/tmp/$(date +%Y%m%d%H%M%S)_$$
data_1=data.gtf
data_2=data.gff3
{
cat $data_1 |
grep -v '^#' |
awk '!a[$3]++{print $3}'
cat $data_2 |
grep -v '^#' |
awk '!a[$3]++{print $3}'
} |
awk '++a[$1]-1' | sort |
while read i ; do
cat $data_1 |
grep -v '^#' |
awk -F'\t' '$3=="'$i'"{$9=""; print}' > $tmp-1
cat $data_2 |
grep -v '^#' |
awk -F'\t' '$3=="'$i'"{$9=""; print}' > $tmp-2
cmp -s $tmp-1 $tmp-2
echo $i $?
done | { echo "=== GTF vs GFF:1/8fld ==="; cat - ;}
rm -f $tmp*
#@ intron
# INPUT: GENCODE.gtf
# OUTPUT:
# 1 ENS(MUS)T
# 2 ENS(MUS)G
# 3 Gene Name
# 4 Chr
# 5 Start
# 6 End
# 7 Strand
# 8 Exon/Intron number
# 9 Gene_biotype
# 10 Tx_biotype
grep -v '^#' |
awk -F'\t' '$3=="exon"{print $1, $4, $5, $7, $9}' |
awk '{ sub(/\..*/,"",$6) #
sub(/\..*/,"",$8) #
print $8, $6, $12, $1, $2, $3, $4, $10, $14}' |
tr -d '";' |
awk 'BEGIN{k0 = $1; s0 = $5; e0 = $6} #
{ k = $1; s = $5; e = $6 #
if(k!=k0){ k0 = k; s0 = s; e0 = e } #
else { #
if ($7=="+") { $5=e0+1; $6=s-1; $7=$7" "++a[$1]; print} #
else { $5=e+1; $6=s0-1; $7=$7" "++a[$1]; print} #
s0 = s; e0 = e } #
}'
#@ exon
# INPUT: GENCODE.gtf
# OUTPUT:
# 1 ENS(MUS)T
# 2 ENS(MUS)G
# 3 Gene Name
# 4 Chr
# 5 Start
# 6 End
# 7 Strand
# 8 Exon/Intron number
# 9 Gene_biotype
# 10 Tx_biotype
grep -v '^#' |
awk -F'\t' '$3=="exon"{print $1, $4, $5, $7, $9}' |
awk '{ sub(/\..*/,"",$6) #
sub(/\..*/,"",$8) #
print $8, $6, $12, $1, $2, $3, $4,++a[$8], $10, $14}' |
tr -d '";'
##### DOWNLOAD
human_fr=37
human_to=39
mouse_fr=26
mouse_to=28
url=http://ftp.ebi.ac.uk/pub/databases/gencode
url_h=$url/Gencode_human
url_m=$url/Gencode_mouse
#@
# METADATA
curl -o README $url/_README.TXT
curl -o README_STATS $url/_README_stats.txt
seq $mouse | while read i ; do
[ -d m$i ] || mkdir m$i
curl -o m$i/md5sums $url_m/release_M$i/MD5SUMS
curl -o m$i/stats $url_m/stats/release_M$i.stats.txt
done
seq 10 $human | while read i ; do
[ -d h$i ] || mkdir h$i
curl -o h$i/md5sums $url_h/release_$i/MD5SUMS
curl -o h$i/stats $url_h/stats/release_$i.stats.txt
done
#@
find . -type f | grep md5sums |
awk '{split($1, a, "/"); print substr(a[2],1,1), substr(a[2],2), $1}' |
sort -k 1,1 -k 2n,2 |
awk '{print $3}' |
xargs awk 'BEGIN{
gtf = "gencode\.vM?[1-9][0-9]?\.annotation\.gtf\.gz"
gff3 = "gencode\.vM?[1-9][0-9]?\.annotation\.gff3\.gz"
tx = "gencode\.vM?[1-9][0-9]?\.transcripts\.fa\.gz"
genome = "GRC[mh]3[89]\.primary_assembly\.genome\.fa\.gz"
}
$2 ~ gtf {print $0, FILENAME > "md5sums_gtf" }
$2 ~ gff3 {print $0, FILENAME > "md5sums_gff3" }
$2 ~ tx {print $0, FILENAME > "md5sums_tx" }
$2 ~ genome {print $0, FILENAME > "md5sums_genome"}'
#@
find . -type f | grep stats |
awk '{split($1, a, "/"); print substr(a[2],1,1), substr(a[2],2), $1}' |
sort -k 1,1 -k 2n,2 |
awk '{print $3}' |
xargs awk '{split(FILENAME, a, "/"); print a[2], $0}' > STATS
#@
# GTF
seq $mouse_fr $mouse_to | while read i ; do
[ -d m$i ] || mkdir m$i
curl -o m$i/a.gtf.gz $url_m/release_M$i/gencode.vM$i.annotation.gtf.gz
done
seq $human_fr $human_to | while read i ; do
[ -d h$i ] || mkdir h$i
curl -o h$i/a.gtf.gz $url_h/release_$i/gencode.v$i.annotation.gtf.gz
done
#@
# GFF3
seq $mouse_fr $mouse_to | while read i ; do
[ -d m$i ] || mkdir m$i
curl -o m$i/a.gff3.gz $url_m/release_M$i/gencode.vM$i.annotation.gff3.gz
done
seq $human_fr $human_to | while read i ; do
[ -d h$i ] || mkdir h$i
curl -o h$i/a.gff3.gz $url_h/release_$i/gencode.v$i.annotation.gff3.gz
done
#@
# TRANSCRIPTOME
seq $mouse_fr $mouse_to | while read i ; do
[ -d m$i ] || mkdir m$i
curl -o m$i/t.fa.gz $url_m/release_M$i/gencode.vM$i.transcripts.fa.gz
done
seq $human_fr $human_to | while read i ; do
[ -d h$i ] || mkdir h$i
curl -o h$i/t.fa.gz $url_h/release_$i/gencode.v$i.transcripts.fa.gz
done
#@
# GENOME
seq $mouse_fr $mouse_to | while read i ; do
[ -d m$i ] || mkdir m$i
[ $i -ge 26 ] &&
curl -o m$i/g.fa.gz $url_m/release_M$i/GRCm39.primary_assembly.genome.fa.gz
[ $i -lt 26 ] &&
curl -o m$i/g.fa.gz $url_m/release_M$i/GRCm38.primary_assembly.genome.fa.gz
done
seq $human_fr $human_to | while read i ; do
[ -d h$i ] || mkdir h$i
[ $i -ge 22 ] &&
curl -o h$i/g.fa.gz $url_h/release_$i/GRCh38.primary_assembly.genome.fa.gz
done
########### VALIDATE
#@
error() {
${2+:} false && echo "${0##*/}: $2" 1>&2
exit $1
}
data=a.gtf
list="transcript gene exon CDS"
tab=$(printf "\t")
#@
grep -v '^#' a.gtf |
awk 'BEGIN{FS=OFS="\t"} #
{ #
a1[$1]; a2[$2]; a3[$3] #
a6[$6]; a7[$7]; a8[$8] } #
END { #
for( i in a1) print "1", i #
for( i in a2) print "2", i #
for( i in a3) print "3", i #
for( i in a6) print "6", i #
for( i in a7) print "7", i #
for( i in a8) print "8", i }' |
sort -t "$tab" -k 1n,1 -k 2,2
#@
{
awk -F'\t' '$3=="transcript"' $data | wc -l | sed "s/^/transcript /"
awk -F'\t' '$3=="gene"' $data | wc -l | sed "s/^/gene /"
} | { echo "#ENTRIES_NUM" ; cat ; echo ;}
#@
for i in $list ; do
grep -v "^#" $data |
awk -F'\t' '$3=="'$i'"{ #
n = split($9, a,";") #
for(i=1;i<=n-1;i++) print NR, i, a[i]}' |
awk '{gsub(" "," "); print}' |
tr -d '"' > tmp-$i
done
#@
cat tmp-gene |
awk '$3=="gene_type"{a[$4]++}END{for(i in a) print i, a[i]}' |
sort -f
#@
for i in $list ; do
awk 'NF!=4{exit 1}' tmp-$i
case $? in (0) echo "$i 0"
;;(*) error 1 "$i 1"
;;esac
done | { echo "#WHITESPACE";cat ; echo ;}
#@
for i in $list ; do
awk '!a[$3]++{print $3}' tmp-$i |
sort |
{ echo "#9th-KEYs $i" ; awk '{print NR, $1}' ; echo ;}
done
#@
for i in $list ; do
awk '!a[$2, $3]++ {print $2, $3}' tmp-$i |
sort -k 1n,1 |
awk '{a[NR]=$0 } #
NR!=$1{exit} #
END{if(NR>=3) for(i=1;i<=NR-2; i++) print a[i]}' |
{ echo "#FIXED-POSITIONS $i"; cat ; echo ;}
done
#@
cat $data |
awk -F'\t' '$3=="transcript"{
split($9, a,";")
sub(/ [^ ]*$/,"",a[1])
if(!u[a[1]]++) print a[1] }'