ServerAdministration: decoy_IPI.pl

File decoy_IPI.pl, 8.0 KB (added by fredrik, 3 years ago)

Script to generate random/reverse database entries with prefix

Line 
1#!/usr/local/bin/perl
2
3#############################################################################
4# Modified version of decoy.pl from Matrix Science. Special naming of     #
5# IPI random or reverse. 2006-10-02 Fredrik Levander Lund University      #
6#                     #
7# Copyright (C) 2006, Matrix Science Limited.                               #
8#                                                                           #
9# This script is free software. You can redistribute and/or                 #
10# modify it under the terms of the GNU General Public License               #
11# as published by the Free Software Foundation; either version 2            #
12# of the License or, (at your option), any later version.                   #
13#                                                                           #
14# These modules are distributed in the hope that they will be useful,       #
15# but WITHOUT ANY WARRANTY; without even the implied warranty of            #
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the              #
17# GNU General Public License for more details.                              #
18#############################################################################
19
20  use strict;
21
22# print usage information if no arguments supplied 
23  unless ($ARGV[0]) {
24    print "Usage:\ndecoy.pl [--random] [--append] [--keep_accessions] input.fasta [output.fasta]\n\n";
25    print "If --random is specified, the output entries will be random sequences\n";
26    print "   with the same average amino acid composition as the input database. \n";
27    print "   Otherwise, the output entries will be created by reversing the input\n";
28    print "   sequences, (faster, but not suitable for PMF or no-enzyme searches).\n";
29    print "If --append is specified, the new entries will be appended to the input\n";
30    print "   database. Otherwise, a separate decoy database file will be created.\n";
31    print "If --keep_accessions is specified, the original accession strings will\n";
32    print "   be retained. This is necessary if you want to use taxonomy and the\n";
33    print "   taxonomy is derived from the accessions, (e.g. NCBI gi2taxid).\n";
34    print "   Otherwise, the string ###REV### or ###RND### is prepended to the\n";
35    print "   original accession strings.\n";
36    print "   IPI entries will instead be renamed to IPRND ord IPREV \n";
37    print "You cannot specify both --append and --keep_accessions.\n";
38    print "An output path must be supplied unless --append is specified.\n";
39    exit 1;
40  }
41
42# get arguments and perform basic error checking
43  my($random, $append, $keep_accessions, $inFile, $outFile);
44  for (my $i = 0; $i <= $#ARGV; $i++) {
45    if (lc($ARGV[$i]) eq "--random") {
46      $random = 1;
47    } elsif (lc($ARGV[$i]) eq "--append") {
48      $append = 1;
49    } elsif (lc($ARGV[$i]) eq "--keep_accessions") {
50      $keep_accessions = 1;
51    } elsif ($ARGV[$i] =~ /^-/) {
52      die "Error: unrecognised argument: " . $ARGV[$i];
53    } elsif (!$inFile) {
54      $inFile = $ARGV[$i];
55    } elsif (!$outFile) {
56      $outFile = $ARGV[$i];
57    } else {
58      die "Error: too many arguments";
59    }
60  }
61 
62  unless ($inFile && -s $inFile) {
63    die "Error: must specify valid input file";
64  }
65 
66  unless ($append) {
67    if ($outFile) {
68      if (-e $outFile) {
69        print "Warning: output file already exists. OK to overwrite? [No] ";
70        my $answer = <STDIN>;
71        unless ($answer =~ /^y/i) {
72          exit 1;
73        }
74      }
75    } else {
76      die "Error: must specify output file path";
77    } 
78  }
79 
80  if ($append && $keep_accessions) {
81    die "Error: cannot combine --append and --keep_accessions";
82  }
83 
84# so far so good, try to open input and output files
85  if ($append) {
86    open (INFILE,  "+<$inFile")          || die "Error: cannot open input file";
87    open (OUTFILE, "+>$inFile" . ".tmp") || die "Error: cannot create temp file";
88  } else {
89    open (INFILE,  "<$inFile")           || die "Error: cannot open input file";
90    open (OUTFILE, ">$outFile")          || die "Error: cannot open output file";
91  }
92 
93# use the same EOL for the output as found in the input file
94  binmode INFILE;
95  binmode OUTFILE;
96  $/ = "\012>";
97  my $eol;
98  while (<INFILE>) {
99    if (length($_) > 1) {
100      if (/\015\012/) {
101        $eol = "\015\012"; #DOS
102      } else {
103        $eol = "\012"; #Unix
104      }
105      last;
106    } else {
107      next;
108    }
109  }
110
111# set the cursor to start of file + 1 character, (i.e. skip first ">")
112  seek(INFILE, 1, 0);
113
114# read a complete sequence entry at a time
115  $/ = "$eol>";
116 
117# for --random, need to determine average AA composition of input database
118  my(@AACount, @residue, $denominator);
119  if ($random) {
120    while (<INFILE>) {
121      my($title, $seq) = split(/$eol/o, $_, 2);
122      $_ = uc($seq);
123      $AACount[0] += tr/A//;
124      $AACount[1] += tr/B//;
125      $AACount[2] += tr/C//;
126      $AACount[3] += tr/D//;
127      $AACount[4] += tr/E//;
128      $AACount[5] += tr/F//;
129      $AACount[6] += tr/G//;
130      $AACount[7] += tr/H//;
131      $AACount[8] += tr/I//;
132      $AACount[9] += tr/J//;
133      $AACount[10] += tr/K//;
134      $AACount[11] += tr/L//;
135      $AACount[12] += tr/M//;
136      $AACount[13] += tr/N//;
137      $AACount[14] += tr/O//;
138      $AACount[15] += tr/P//;
139      $AACount[16] += tr/Q//;
140      $AACount[17] += tr/R//;
141      $AACount[18] += tr/S//;
142      $AACount[19] += tr/T//;
143      $AACount[20] += tr/U//;
144      $AACount[21] += tr/V//;
145      $AACount[22] += tr/W//;
146      $AACount[23] += tr/X//;
147      $AACount[24] += tr/Y//;
148      $AACount[25] += tr/Z//;
149    }
150  # $denominator is total number of residues
151    foreach (@AACount) {
152      $denominator += $_;
153    }
154  # populate lookup vector with residues in same proportions
155    for (my $i = 0; $i < 26; $i++) {
156      for (my $j = 0; $j < int($AACount[$i] * 10000 / $denominator + 0.5); $j++) {
157        push @residue, chr(65 + $i);
158      }
159    }
160  # ensure vector fully populated by topping up with common residue A
161    while (scalar(@residue) < 10000) {
162      push @residue, "A";
163    }
164  }
165
166# set the cursor to start of file + 1 character, (i.e. skip first ">")
167  seek(INFILE, 1, 0);
168
169# loop through input file and create the output file
170  while (<INFILE>) {
171    my($title, $seq) = split(/$eol/o, $_, 2);
172    my($accession, $description) = split(/\s+/, $title, 2);
173  # remove any non-residue characters from input sequence
174    $seq =~ tr/a-zA-Z//cd;
175    my @pieces;
176    if ($random) {
177      if ($keep_accessions) {
178        print OUTFILE ">" . $accession . " Random sequence, was " . $description . $eol;
179      } else {
180    if ($accession =~ /IPI:/i){
181    my ($acc, $des) = split (/[\s|]/,$accession,2);
182    $acc =~ s/:IPI/:IPRND/i;
183    print OUTFILE ">" . $acc . " Random sequence, was " . $des . " " . $description . $eol; 
184    } else {
185          print OUTFILE ">###RND###" . $accession . " Random sequence, was " . $description . $eol;
186    }
187      }
188    # create random sequence
189      my $len = length($seq);
190      $seq = "";
191      for (my $i = 0; $i < $len; $i++) {
192        $seq .= $residue[int(rand 10000)];     
193      }
194    # chop into little pieces for output
195      @pieces = $seq =~ /(.{1,60})/g;
196      foreach (@pieces) {
197        print OUTFILE $_ . $eol;
198      }
199    } else {
200      if ($keep_accessions) {
201        print OUTFILE ">" . $accession . " Reverse sequence, was " . $description . $eol;
202      } else {
203        if ($accession =~ /IPI:/i){
204                my ($acc, $des) = split (/[\s|]/,$accession,2);
205                $acc =~ s/:IPI/:IPREV/i;
206                print OUTFILE ">" . $acc . " Reverse sequence, was " . $des . " " . $description . $eol;
207        } else {
208    print OUTFILE ">###REV###" . $accession . " Reverse sequence, was " . $description . $eol;
209        }
210      }
211    # create reversed sequence
212      $seq = reverse $seq;
213    # chop into little pieces for output
214      @pieces = $seq =~ /(.{1,60})/g;
215      foreach (@pieces) {
216        print OUTFILE $_ . $eol;
217      }
218    }
219  }
220
221# if --append, copy output to end of input file and delete temp file
222  if ($append) {
223    seek(OUTFILE, 0, 0);
224    while (<OUTFILE>) {
225        print INFILE $_;     
226    }
227    close OUTFILE;
228    unlink "$inFile" . ".tmp";
229  }
230
231  exit 0;
232