| 1 | #!/usr/local/bin/perl |
|---|
| 2 | |
|---|
| 3 | ############################################################################# |
|---|
| 4 | # Modified version of decoy.pl from Matrix Science. Special naming of # |
|---|
| 5 | # IPI random or reverse. 2006-10-02 Fredrik Levander Lund University # |
|---|
| 6 | # # |
|---|
| 7 | # Copyright (C) 2006, Matrix Science Limited. # |
|---|
| 8 | # # |
|---|
| 9 | # This script is free software. You can redistribute and/or # |
|---|
| 10 | # modify it under the terms of the GNU General Public License # |
|---|
| 11 | # as published by the Free Software Foundation; either version 2 # |
|---|
| 12 | # of the License or, (at your option), any later version. # |
|---|
| 13 | # # |
|---|
| 14 | # These modules are distributed in the hope that they will be useful, # |
|---|
| 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of # |
|---|
| 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # |
|---|
| 17 | # GNU General Public License for more details. # |
|---|
| 18 | ############################################################################# |
|---|
| 19 | |
|---|
| 20 | use strict; |
|---|
| 21 | |
|---|
| 22 | # print usage information if no arguments supplied |
|---|
| 23 | unless ($ARGV[0]) { |
|---|
| 24 | print "Usage:\ndecoy.pl [--random] [--append] [--keep_accessions] input.fasta [output.fasta]\n\n"; |
|---|
| 25 | print "If --random is specified, the output entries will be random sequences\n"; |
|---|
| 26 | print " with the same average amino acid composition as the input database. \n"; |
|---|
| 27 | print " Otherwise, the output entries will be created by reversing the input\n"; |
|---|
| 28 | print " sequences, (faster, but not suitable for PMF or no-enzyme searches).\n"; |
|---|
| 29 | print "If --append is specified, the new entries will be appended to the input\n"; |
|---|
| 30 | print " database. Otherwise, a separate decoy database file will be created.\n"; |
|---|
| 31 | print "If --keep_accessions is specified, the original accession strings will\n"; |
|---|
| 32 | print " be retained. This is necessary if you want to use taxonomy and the\n"; |
|---|
| 33 | print " taxonomy is derived from the accessions, (e.g. NCBI gi2taxid).\n"; |
|---|
| 34 | print " Otherwise, the string ###REV### or ###RND### is prepended to the\n"; |
|---|
| 35 | print " original accession strings.\n"; |
|---|
| 36 | print " IPI entries will instead be renamed to IPRND ord IPREV \n"; |
|---|
| 37 | print "You cannot specify both --append and --keep_accessions.\n"; |
|---|
| 38 | print "An output path must be supplied unless --append is specified.\n"; |
|---|
| 39 | exit 1; |
|---|
| 40 | } |
|---|
| 41 | |
|---|
| 42 | # get arguments and perform basic error checking |
|---|
| 43 | my($random, $append, $keep_accessions, $inFile, $outFile); |
|---|
| 44 | for (my $i = 0; $i <= $#ARGV; $i++) { |
|---|
| 45 | if (lc($ARGV[$i]) eq "--random") { |
|---|
| 46 | $random = 1; |
|---|
| 47 | } elsif (lc($ARGV[$i]) eq "--append") { |
|---|
| 48 | $append = 1; |
|---|
| 49 | } elsif (lc($ARGV[$i]) eq "--keep_accessions") { |
|---|
| 50 | $keep_accessions = 1; |
|---|
| 51 | } elsif ($ARGV[$i] =~ /^-/) { |
|---|
| 52 | die "Error: unrecognised argument: " . $ARGV[$i]; |
|---|
| 53 | } elsif (!$inFile) { |
|---|
| 54 | $inFile = $ARGV[$i]; |
|---|
| 55 | } elsif (!$outFile) { |
|---|
| 56 | $outFile = $ARGV[$i]; |
|---|
| 57 | } else { |
|---|
| 58 | die "Error: too many arguments"; |
|---|
| 59 | } |
|---|
| 60 | } |
|---|
| 61 | |
|---|
| 62 | unless ($inFile && -s $inFile) { |
|---|
| 63 | die "Error: must specify valid input file"; |
|---|
| 64 | } |
|---|
| 65 | |
|---|
| 66 | unless ($append) { |
|---|
| 67 | if ($outFile) { |
|---|
| 68 | if (-e $outFile) { |
|---|
| 69 | print "Warning: output file already exists. OK to overwrite? [No] "; |
|---|
| 70 | my $answer = <STDIN>; |
|---|
| 71 | unless ($answer =~ /^y/i) { |
|---|
| 72 | exit 1; |
|---|
| 73 | } |
|---|
| 74 | } |
|---|
| 75 | } else { |
|---|
| 76 | die "Error: must specify output file path"; |
|---|
| 77 | } |
|---|
| 78 | } |
|---|
| 79 | |
|---|
| 80 | if ($append && $keep_accessions) { |
|---|
| 81 | die "Error: cannot combine --append and --keep_accessions"; |
|---|
| 82 | } |
|---|
| 83 | |
|---|
| 84 | # so far so good, try to open input and output files |
|---|
| 85 | if ($append) { |
|---|
| 86 | open (INFILE, "+<$inFile") || die "Error: cannot open input file"; |
|---|
| 87 | open (OUTFILE, "+>$inFile" . ".tmp") || die "Error: cannot create temp file"; |
|---|
| 88 | } else { |
|---|
| 89 | open (INFILE, "<$inFile") || die "Error: cannot open input file"; |
|---|
| 90 | open (OUTFILE, ">$outFile") || die "Error: cannot open output file"; |
|---|
| 91 | } |
|---|
| 92 | |
|---|
| 93 | # use the same EOL for the output as found in the input file |
|---|
| 94 | binmode INFILE; |
|---|
| 95 | binmode OUTFILE; |
|---|
| 96 | $/ = "\012>"; |
|---|
| 97 | my $eol; |
|---|
| 98 | while (<INFILE>) { |
|---|
| 99 | if (length($_) > 1) { |
|---|
| 100 | if (/\015\012/) { |
|---|
| 101 | $eol = "\015\012"; #DOS |
|---|
| 102 | } else { |
|---|
| 103 | $eol = "\012"; #Unix |
|---|
| 104 | } |
|---|
| 105 | last; |
|---|
| 106 | } else { |
|---|
| 107 | next; |
|---|
| 108 | } |
|---|
| 109 | } |
|---|
| 110 | |
|---|
| 111 | # set the cursor to start of file + 1 character, (i.e. skip first ">") |
|---|
| 112 | seek(INFILE, 1, 0); |
|---|
| 113 | |
|---|
| 114 | # read a complete sequence entry at a time |
|---|
| 115 | $/ = "$eol>"; |
|---|
| 116 | |
|---|
| 117 | # for --random, need to determine average AA composition of input database |
|---|
| 118 | my(@AACount, @residue, $denominator); |
|---|
| 119 | if ($random) { |
|---|
| 120 | while (<INFILE>) { |
|---|
| 121 | my($title, $seq) = split(/$eol/o, $_, 2); |
|---|
| 122 | $_ = uc($seq); |
|---|
| 123 | $AACount[0] += tr/A//; |
|---|
| 124 | $AACount[1] += tr/B//; |
|---|
| 125 | $AACount[2] += tr/C//; |
|---|
| 126 | $AACount[3] += tr/D//; |
|---|
| 127 | $AACount[4] += tr/E//; |
|---|
| 128 | $AACount[5] += tr/F//; |
|---|
| 129 | $AACount[6] += tr/G//; |
|---|
| 130 | $AACount[7] += tr/H//; |
|---|
| 131 | $AACount[8] += tr/I//; |
|---|
| 132 | $AACount[9] += tr/J//; |
|---|
| 133 | $AACount[10] += tr/K//; |
|---|
| 134 | $AACount[11] += tr/L//; |
|---|
| 135 | $AACount[12] += tr/M//; |
|---|
| 136 | $AACount[13] += tr/N//; |
|---|
| 137 | $AACount[14] += tr/O//; |
|---|
| 138 | $AACount[15] += tr/P//; |
|---|
| 139 | $AACount[16] += tr/Q//; |
|---|
| 140 | $AACount[17] += tr/R//; |
|---|
| 141 | $AACount[18] += tr/S//; |
|---|
| 142 | $AACount[19] += tr/T//; |
|---|
| 143 | $AACount[20] += tr/U//; |
|---|
| 144 | $AACount[21] += tr/V//; |
|---|
| 145 | $AACount[22] += tr/W//; |
|---|
| 146 | $AACount[23] += tr/X//; |
|---|
| 147 | $AACount[24] += tr/Y//; |
|---|
| 148 | $AACount[25] += tr/Z//; |
|---|
| 149 | } |
|---|
| 150 | # $denominator is total number of residues |
|---|
| 151 | foreach (@AACount) { |
|---|
| 152 | $denominator += $_; |
|---|
| 153 | } |
|---|
| 154 | # populate lookup vector with residues in same proportions |
|---|
| 155 | for (my $i = 0; $i < 26; $i++) { |
|---|
| 156 | for (my $j = 0; $j < int($AACount[$i] * 10000 / $denominator + 0.5); $j++) { |
|---|
| 157 | push @residue, chr(65 + $i); |
|---|
| 158 | } |
|---|
| 159 | } |
|---|
| 160 | # ensure vector fully populated by topping up with common residue A |
|---|
| 161 | while (scalar(@residue) < 10000) { |
|---|
| 162 | push @residue, "A"; |
|---|
| 163 | } |
|---|
| 164 | } |
|---|
| 165 | |
|---|
| 166 | # set the cursor to start of file + 1 character, (i.e. skip first ">") |
|---|
| 167 | seek(INFILE, 1, 0); |
|---|
| 168 | |
|---|
| 169 | # loop through input file and create the output file |
|---|
| 170 | while (<INFILE>) { |
|---|
| 171 | my($title, $seq) = split(/$eol/o, $_, 2); |
|---|
| 172 | my($accession, $description) = split(/\s+/, $title, 2); |
|---|
| 173 | # remove any non-residue characters from input sequence |
|---|
| 174 | $seq =~ tr/a-zA-Z//cd; |
|---|
| 175 | my @pieces; |
|---|
| 176 | if ($random) { |
|---|
| 177 | if ($keep_accessions) { |
|---|
| 178 | print OUTFILE ">" . $accession . " Random sequence, was " . $description . $eol; |
|---|
| 179 | } else { |
|---|
| 180 | if ($accession =~ /IPI:/i){ |
|---|
| 181 | my ($acc, $des) = split (/[\s|]/,$accession,2); |
|---|
| 182 | $acc =~ s/:IPI/:IPRND/i; |
|---|
| 183 | print OUTFILE ">" . $acc . " Random sequence, was " . $des . " " . $description . $eol; |
|---|
| 184 | } else { |
|---|
| 185 | print OUTFILE ">###RND###" . $accession . " Random sequence, was " . $description . $eol; |
|---|
| 186 | } |
|---|
| 187 | } |
|---|
| 188 | # create random sequence |
|---|
| 189 | my $len = length($seq); |
|---|
| 190 | $seq = ""; |
|---|
| 191 | for (my $i = 0; $i < $len; $i++) { |
|---|
| 192 | $seq .= $residue[int(rand 10000)]; |
|---|
| 193 | } |
|---|
| 194 | # chop into little pieces for output |
|---|
| 195 | @pieces = $seq =~ /(.{1,60})/g; |
|---|
| 196 | foreach (@pieces) { |
|---|
| 197 | print OUTFILE $_ . $eol; |
|---|
| 198 | } |
|---|
| 199 | } else { |
|---|
| 200 | if ($keep_accessions) { |
|---|
| 201 | print OUTFILE ">" . $accession . " Reverse sequence, was " . $description . $eol; |
|---|
| 202 | } else { |
|---|
| 203 | if ($accession =~ /IPI:/i){ |
|---|
| 204 | my ($acc, $des) = split (/[\s|]/,$accession,2); |
|---|
| 205 | $acc =~ s/:IPI/:IPREV/i; |
|---|
| 206 | print OUTFILE ">" . $acc . " Reverse sequence, was " . $des . " " . $description . $eol; |
|---|
| 207 | } else { |
|---|
| 208 | print OUTFILE ">###REV###" . $accession . " Reverse sequence, was " . $description . $eol; |
|---|
| 209 | } |
|---|
| 210 | } |
|---|
| 211 | # create reversed sequence |
|---|
| 212 | $seq = reverse $seq; |
|---|
| 213 | # chop into little pieces for output |
|---|
| 214 | @pieces = $seq =~ /(.{1,60})/g; |
|---|
| 215 | foreach (@pieces) { |
|---|
| 216 | print OUTFILE $_ . $eol; |
|---|
| 217 | } |
|---|
| 218 | } |
|---|
| 219 | } |
|---|
| 220 | |
|---|
| 221 | # if --append, copy output to end of input file and delete temp file |
|---|
| 222 | if ($append) { |
|---|
| 223 | seek(OUTFILE, 0, 0); |
|---|
| 224 | while (<OUTFILE>) { |
|---|
| 225 | print INFILE $_; |
|---|
| 226 | } |
|---|
| 227 | close OUTFILE; |
|---|
| 228 | unlink "$inFile" . ".tmp"; |
|---|
| 229 | } |
|---|
| 230 | |
|---|
| 231 | exit 0; |
|---|
| 232 | |
|---|