New Upstream Snapshot - rdp-alignment

Ready changes

Summary

Merged new upstream version: 1.2.0+git20201229.1.32bb2e3 (was: 1.2.0).

Resulting package

Built on 2023-01-19T16:17 (took 8m21s)

The resulting binary packages can be installed (if you have the apt repository enabled) by running one of:

apt install -t fresh-snapshots rdp-alignment

Lintian Result

Diff

diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index fa6ecd3..0000000
--- a/.gitignore
+++ /dev/null
@@ -1,9 +0,0 @@
-.svn
-build
-*.so
-*.dylib
-dist
-*~
-#*#
-lib/
-private/
diff --git a/build.xml b/build.xml
index c5a26cb..ea5bffd 100644
--- a/build.xml
+++ b/build.xml
@@ -84,7 +84,7 @@
 
         <mkdir dir="${ivy.jar.dir}"/>
         <!-- download Ivy from web site so that it can be used even without any special installation -->
-        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+        <get src="http://insecure.repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
              dest="${ivy.jar.file}" usetimestamp="true"/>
     </target>
 
diff --git a/debian/changelog b/debian/changelog
index 9ad705d..1484f85 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+rdp-alignment (1.2.0+git20201229.1.32bb2e3-1) UNRELEASED; urgency=low
+
+  * New upstream snapshot.
+
+ -- Debian Janitor <janitor@jelmer.uk>  Thu, 19 Jan 2023 16:12:19 -0000
+
 rdp-alignment (1.2.0-8) unstable; urgency=medium
 
   * Fix watch file
diff --git a/ivysettings.xml b/ivysettings.xml
new file mode 100644
index 0000000..0785934
--- /dev/null
+++ b/ivysettings.xml
@@ -0,0 +1,9 @@
+<ivysettings>
+    <settings defaultResolver="mainchain" />
+	<resolvers>
+		<chain name="mainchain">
+			<ibiblio name="public" m2compatible="true" />
+			<ibiblio name="maven" m2compatible="true" root="http://insecure.repo1.maven.org/maven2/" pattern="[organisation]/[module]/[revision]/[artifact]-[revision].[ext]" />
+		</chain>
+	</resolvers>
+</ivysettings>
diff --git a/jni/wrapper b/jni/wrapper
deleted file mode 100755
index 0294e87..0000000
Binary files a/jni/wrapper and /dev/null differ
diff --git a/src/edu/msu/cme/rdp/alignment/pairwise/PairwiseAlignment.java b/src/edu/msu/cme/rdp/alignment/pairwise/PairwiseAlignment.java
index d050c47..67cfc8c 100644
--- a/src/edu/msu/cme/rdp/alignment/pairwise/PairwiseAlignment.java
+++ b/src/edu/msu/cme/rdp/alignment/pairwise/PairwiseAlignment.java
@@ -30,6 +30,7 @@ public class PairwiseAlignment {
     private List<Integer> scores;
     private int starti, endi;
     private int startj, endj;
+    private double ident = Double.NaN;
 
     public PairwiseAlignment(String alignedSeqi, String alignedSeqj, List<Integer> scores, int starti, int endi, int startj, int endj) {
         this.alignedSeqi = alignedSeqi;
@@ -73,4 +74,12 @@ public class PairwiseAlignment {
     public int getStartj() {
         return startj;
     }
+    
+    public double getIdent(){
+        return ident;
+    }
+    
+    public void setIdent(double i){
+        ident = i;
+    }
 }
diff --git a/src/edu/msu/cme/rdp/alignment/pairwise/PairwiseKNN.java b/src/edu/msu/cme/rdp/alignment/pairwise/PairwiseKNN.java
index c1aa9f4..fec1f75 100644
--- a/src/edu/msu/cme/rdp/alignment/pairwise/PairwiseKNN.java
+++ b/src/edu/msu/cme/rdp/alignment/pairwise/PairwiseKNN.java
@@ -30,14 +30,21 @@ import edu.msu.cme.rdp.readseq.utils.kmermatch.KmerMatchCore;
 import edu.msu.cme.rdp.readseq.utils.kmermatch.NuclSeqMatch;
 import edu.msu.cme.rdp.readseq.utils.kmermatch.ProteinSeqMatch;
 import edu.msu.cme.rdp.readseq.utils.orientation.GoodWordIterator;
+import edu.msu.cme.rdp.readseq.utils.orientation.OrientationChecker;
 import edu.msu.cme.rdp.readseq.utils.orientation.ProteinWordGenerator;
 import java.io.File;
 import java.io.IOException;
 import java.io.PrintStream;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
+import java.util.HashMap;
 import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.HelpFormatter;
 import org.apache.commons.cli.Options;
@@ -49,21 +56,42 @@ import org.apache.commons.cli.PosixParser;
  */
 public class PairwiseKNN {
 
-    private File queryFile;
-    private File refFile;
-    private int k;  
-    private int prefilter = 0;
+    private final File refFile;
+    private final int k;  // final number of hits to return
+    private final int prefilter;  // number of hits to keep from prefilter stages
     private int wordSize;
-    private  AlignmentMode mode;
-    private List<Sequence> dbSeqs;
-    private PrintStream out;
+    private final AlignmentMode mode;
+    private final HashMap<String, Sequence> dbSeqsMap = new HashMap(); // keep all the refseq in memory for pairwise alignment
+    private final ScoringMatrix matrix;
+    private KmerMatchCore kerMatchCore;
     private static final String dformat = "%1$.3f";
+    private static final DistanceModel dist = new IdentityDistanceModel();
+    private static final Comparator c = new ResultComparator();
+    private final SequenceType refSeqType ;
     
     public static class Neighbor {
 
         PairwiseAlignment alignment;
         boolean reverse;
         Sequence dbSeq;
+        
+        public boolean isReverse(){
+            return reverse;
+        }
+        
+        public PairwiseAlignment getAlignment(){
+            return alignment;
+        }
+        
+        public Sequence getDbSeq(){
+            return dbSeq;
+        }
+    }
+    
+    public static class ResultComparator implements Comparator<Neighbor> {
+        public int compare(Neighbor t, Neighbor t1) {
+            return t.alignment.getScore() - t1.alignment.getScore();
+        }
     }
 
     private static <T> void insert(T n, List<T> list, Comparator<T> comp, int k) {
@@ -80,148 +108,185 @@ public class PairwiseKNN {
         }
     }
 
-    public static List<Neighbor> getKNN(Sequence query, List<Sequence> dbSeqs, AlignmentMode mode, int k, int wordSize, int prefilter) throws IOException {
-        List<Neighbor> ret = new ArrayList();
-        Neighbor n;
-        Comparator c = new Comparator<Neighbor>() {
-            public int compare(Neighbor t, Neighbor t1) {
-                return t.alignment.getScore() - t1.alignment.getScore();
+   
+    public PairwiseKNN( File refFile, AlignmentMode mode, int k, int ws, int prefilter) throws IOException{
+        this.refFile = refFile;
+        this.mode = mode;
+        this.k = k;
+        this.prefilter = prefilter;
+        this.wordSize = ws;
+        
+        refSeqType = SeqUtils.guessSequenceType(refFile);
+        parseRefSeq(refFile);
+        if ( refSeqType == SequenceType.Protein){
+            matrix = ScoringMatrix.getDefaultProteinMatrix();
+            if ( wordSize == 0 ){
+                this.wordSize = ProteinWordGenerator.WORDSIZE;
+            }
+            if ( prefilter > 0){
+                kerMatchCore = new ProteinSeqMatch(new ArrayList<Sequence>(dbSeqsMap.values()), wordSize);
             }
-        };
-
-        SequenceType seqType = SeqUtils.guessSequenceType(query);
-        ScoringMatrix matrix;
-        KmerMatchCore kerMatchCore;
-        if (seqType == SequenceType.Nucleotide) {
-            matrix = ScoringMatrix.getDefaultNuclMatrix();
-            kerMatchCore = new NuclSeqMatch(dbSeqs, wordSize); 
         } else {
-            matrix = ScoringMatrix.getDefaultProteinMatrix();
-            kerMatchCore = new ProteinSeqMatch(dbSeqs, wordSize); 
-        }
-
-        List<Sequence> refList;
-       
-        if ( prefilter == 0) {   // do not pre-filter the reference seqs
-            refList = dbSeqs;
-        }else {
-            refList = new ArrayList<Sequence>();
-            ArrayList<ProteinSeqMatch.BestMatch> topKMatches= kerMatchCore.findTopKMatch(query, prefilter);
-            for (KmerMatchCore.BestMatch bestTarget : topKMatches) {
-                refList.add(bestTarget.getBestMatch());
+            matrix = ScoringMatrix.getDefaultNuclMatrix();
+            if ( wordSize == 0 ){
+                this.wordSize = GoodWordIterator.DEFAULT_WORDSIZE ;
             }
+            if ( prefilter > 0){
+                kerMatchCore = new NuclSeqMatch(new ArrayList<Sequence>(dbSeqsMap.values()), wordSize);
+            }
+        }
+    }
+     
+    private synchronized void parseRefSeq(File file) throws IOException{
+        SeqReader reader = new SequenceReader(file);
+        Sequence seq;
+        while ((seq = reader.readNextSequence()) != null) {
+            dbSeqsMap.put(seq.getSeqName(), seq);
+        }
+        reader.close();
+    }
+    
+    public Sequence getRefSeq(String seqName){
+        return this.dbSeqsMap.get(seqName);
+    }
+    
+    public String getRefFilename(){
+        return this.refFile.getName();
+    }
+    
+    public int getK(){
+        return k;
+    }
+    /**
+     * 
+     * @param seq
+     * @param refList, allow different reference set for flexibility
+     * @param isSeqReversed indicates the orientation of the sequence compared to the original seq
+     * @param checkReverse for bacteria and archaea, we can use OrientationChecker, for other sequences, we should provide option
+     * @return
+     * @throws IOException
+     * @throws OverlapCheckFailedException 
+     */
+    public List<Neighbor> getKNN(Sequence seq, Collection<Sequence> refList, boolean removeBaseN, boolean isSeqReversed, boolean checkReverse) throws IOException, OverlapCheckFailedException {
+        List<Neighbor> ret = new ArrayList();
+        Neighbor n;
+        if ( removeBaseN){
+            Sequence temp = new Sequence(seq.getSeqName(), seq.getDesc(), seq.getSeqString().toUpperCase().replace("N", ""));
+            seq = temp;
         }
         
         for (Sequence dbSeq : refList) {
             n = new Neighbor();
             n.dbSeq = dbSeq;
-            PairwiseAlignment fwd = PairwiseAligner.align(n.dbSeq.getSeqString(), query.getSeqString(), matrix, mode);
-            if (seqType == SequenceType.Nucleotide) {
-                PairwiseAlignment rc = PairwiseAligner.align(n.dbSeq.getSeqString(), IUBUtilities.reverseComplement(query.getSeqString()), matrix, mode);
-
+            PairwiseAlignment fwd = PairwiseAligner.align(n.dbSeq.getSeqString(), seq.getSeqString(), matrix, mode);
+            if (refSeqType == SequenceType.Nucleotide &&  checkReverse) {
+                            
+                PairwiseAlignment rc = PairwiseAligner.align(n.dbSeq.getSeqString(), IUBUtilities.reverseComplement(seq.getSeqString()), matrix, mode);
                 if (rc.getScore() > fwd.getScore()) {
                     n.alignment = rc;
-                    n.reverse = true;
+                    n.reverse = isSeqReversed ? false :true;
+                    double ident = 1 - dist.getDistance(rc.getAlignedSeqi().getBytes(), rc.getAlignedSeqj().getBytes(), 0);
+                    rc.setIdent(ident);
                 } else {
                     n.alignment = fwd;
-                    n.reverse = false;
+                    n.reverse = isSeqReversed ? true : false;
+                    double ident = 1 - dist.getDistance(fwd.getAlignedSeqi().getBytes(), fwd.getAlignedSeqj().getBytes(), 0);
+                    fwd.setIdent(ident);
                 }
+               
             } else {
+                double ident = 1 - dist.getDistance(fwd.getAlignedSeqi().getBytes(), fwd.getAlignedSeqj().getBytes(), 0);
+                fwd.setIdent(ident);
                 n.alignment = fwd;
-                n.reverse = false;
+                n.reverse = isSeqReversed;
             }
 
             insert(n, ret, c, k);
         }
                     
         return ret;
-    }
-
-    public PairwiseKNN(File queryFile, File refFile, PrintStream out, AlignmentMode mode, int k, int wordSize, int prefilter) throws IOException{
-        this.queryFile = queryFile;
-        this.refFile = refFile;
-        this.out = out;
-        this.mode = mode;
-        this.k = k;
-        this.prefilter = prefilter;
-        this.wordSize = wordSize;
-        SequenceType querySeqType = SeqUtils.guessSequenceType(queryFile);
-        SequenceType refSeqType = SeqUtils.guessSequenceType(refFile);
-
-        if ( querySeqType !=  refSeqType) {
-            throw new RuntimeException("reference seqs and query seqs must be the same type, either protein or nucleotide. " );
-        }
-        if ( wordSize == 0 ){
-            if ( refSeqType == SequenceType.Protein){
-                this.wordSize = ProteinWordGenerator.WORDSIZE;
-            } else {
-                this.wordSize = GoodWordIterator.DEFAULT_WORDSIZE ;
+    }  
+    
+    public List<Neighbor> findMatch(Sequence seq, boolean removeBaseN) throws IOException, OverlapCheckFailedException {
+        boolean isReversed = false;
+        if ( this.refSeqType == SequenceType.Nucleotide ){
+            //check orientation
+            isReversed = OrientationChecker.getChecker().isSeqReversed(seq.getSeqString());
+            if ( isReversed ){
+                seq = new Sequence(seq.getSeqName(), seq.getDesc(), IUBUtilities.reverseComplement(seq.getSeqString()));
             }
         }
-            
-        dbSeqs = SequenceReader.readFully(refFile);
+        if ( prefilter == 0) {   // do not pre-filter the reference seqs, so need to check both orientation
+            return getKNN(seq, dbSeqsMap.values(), removeBaseN, isReversed, true);
+        }else {
+            List<Sequence> refList = new ArrayList<Sequence>();
+            ArrayList<ProteinSeqMatch.BestMatch> topKMatches= kerMatchCore.findTopKMatch(seq, prefilter);
+                        
+            for (KmerMatchCore.BestMatch bestTarget : topKMatches) {
+                refList.add(bestTarget.getBestMatch());
+            }
+            return getKNN(seq, refList, removeBaseN, isReversed, false);
+        }        
     }
     
-    public void match() throws IOException, OverlapCheckFailedException {
-        DistanceModel dist = new IdentityDistanceModel();
-
-        out.println("#query file: " + queryFile.getName() + " db file: " + refFile.getName() + " k: " + k + " mode: " + mode + " usePrefilter: " + prefilter);
-        out.println("#seqname\tk\tref seqid\tref desc\torientation\tscore\tident\tquery start\tquery end\tquery length\tref start\tref end");
-        Sequence seq;
-        List<Neighbor> alignments;
+    private synchronized void printAlignment(Sequence seq, List<Neighbor> alignments, PrintStream out) throws IOException{
         Neighbor n;
         PairwiseAlignment alignment;
-        SequenceReader queryReader = new SequenceReader(queryFile);
-        while ((seq = queryReader.readNextSequence()) != null) {
-            alignments = getKNN(seq, dbSeqs, mode, k, wordSize, prefilter);
+        for (int index = 0; index < alignments.size(); index++) {
+            n = alignments.get(index);
+            alignment = n.alignment;
 
-            for (int index = 0; index < alignments.size(); index++) {
-                n = alignments.get(index);
-                alignment = n.alignment;
-                double ident = 1 - dist.getDistance(alignment.getAlignedSeqi().getBytes(), alignment.getAlignedSeqj().getBytes(), 0);
+            out.println("@" + seq.getSeqName()
+                    + "\t" + (index + 1)
+                    + "\t" + (n.reverse ? "-" : "+")
+                    + "\t" + alignment.getScore()
+                    + "\t" + String.format(dformat,alignment.getIdent())
+                    + "\t" + alignment.getStartj()
+                    + "\t" + alignment.getEndj()
+                    + "\t" + seq.getSeqString().length()
+                    + "\t" + alignment.getStarti()
+                    + "\t" + alignment.getEndi()
+                    + "\t" + n.dbSeq.getSeqName()
+                    + "\t" + n.dbSeq.getDesc());
 
-                out.println("@" + seq.getSeqName()
-                        + "\t" + (index + 1)
-                        + "\t" + n.dbSeq.getSeqName()
-                        + "\t" + n.dbSeq.getDesc()
-                        + "\t" + (n.reverse ? "-" : "+")
-                        + "\t" + alignment.getScore()
-                        + "\t" + String.format(dformat,ident)
-                        + "\t" + alignment.getStartj()
-                        + "\t" + alignment.getEndj()
-                        + "\t" + seq.getSeqString().length()
-                        + "\t" + alignment.getStarti()
-                        + "\t" + alignment.getEndi());
-
-                out.println(">" + alignment.getAlignedSeqj());
-                out.println(">" + alignment.getAlignedSeqi());
-            }
+            out.println(">" + alignment.getAlignedSeqj());
+            out.println(">" + alignment.getAlignedSeqi());
         }
-        queryReader.close();
-        out.close();
     }
-    
+   
     public static void main(String[] args) throws Exception { 
+        final int maxThreads;
+        final int maxTasks = 1000;
         File queryFile;
         File refFile;
         AlignmentMode mode = AlignmentMode.glocal;
         int k = 1;
         int wordSize = 0 ;
         int prefilter = 10 ;  //  The top p closest protein targets
-        PrintStream out = new PrintStream(System.out);
+        final boolean removeBaseN;
+        final PrintStream out ;
 
         Options options = new Options();
-        options.addOption("m", "mode", true, "Alignment mode {global, glocal, local, overlap, overlap_trimmed} (default= glocal)");
+        options.addOption("m", "mode", true, "Alignment mode {global, glocal, local, overlap, overlap_trim} (default= glocal)");
         options.addOption("k", true, "K-nearest neighbors to return. (default = 1)");
         options.addOption("o", "out", true, "Redirect output to file instead of stdout");
         options.addOption("p", "prefilter", true, "The top p closest targets from kmer prefilter step. Set p=0 to disable the prefilter step. (default = 10) ");
         options.addOption("w", "word-size", true, "The word size used to find closest targets during prefilter. (default " + ProteinWordGenerator.WORDSIZE 
                 + " for protein, " + GoodWordIterator.DEFAULT_WORDSIZE  + " for nucleotide)");
-        
+        options.addOption("n", false, "Remove Ns from the query. Default is false");
+        options.addOption("t", "threads", true, "#Threads to use. This process is CPU intensive. (default 1)");
 
         try {
             CommandLine line = new PosixParser().parse(options, args);
 
+            if (line.hasOption("threads")) {
+                maxThreads = Integer.valueOf(line.getOptionValue("threads"));
+                if ( maxThreads >= Runtime.getRuntime().availableProcessors()) {
+                   System.err.println(" Runtime.getRuntime().availableProcessors() " + Runtime.getRuntime().availableProcessors()); 
+                }                
+            } else {
+                maxThreads = 1;
+            }
             if (line.hasOption("mode")) {
                 mode = AlignmentMode.valueOf(line.getOptionValue("mode"));
             }
@@ -249,8 +314,14 @@ public class PairwiseKNN {
             
             if (line.hasOption("out")) {
                 out = new PrintStream(line.getOptionValue("out"));
+            }else {
+                out = new PrintStream(System.out);
+            }
+            if (line.hasOption('n')) {
+                removeBaseN = true;
+            }else {
+                removeBaseN = false;
             }
-
             args = line.getArgs();
 
             if (args.length != 2) {
@@ -266,9 +337,45 @@ public class PairwiseKNN {
             return;
         }
         
-        PairwiseKNN theObj = new PairwiseKNN(queryFile, refFile, out, mode, k, wordSize, prefilter);
-        theObj.match();
+        SequenceType querySeqType = SeqUtils.guessSequenceType(queryFile);
+        SequenceType refSeqType = SeqUtils.guessSequenceType(refFile);
+
+        if ( querySeqType !=  refSeqType) {
+            throw new RuntimeException("reference seqs and query seqs must be the same type, either protein or nucleotide. " );
+        }
+        final PairwiseKNN theObj = new PairwiseKNN( refFile, mode, k, wordSize, prefilter);
         
+        final AtomicInteger outstandingTasks = new AtomicInteger();        
+        ExecutorService service = Executors.newFixedThreadPool(maxThreads);
+        out.println("#query file: " + queryFile.getName() + " db file: " + refFile.getName() + " k: " + k + " mode: " + mode + " usePrefilter: " + prefilter);
+        out.println("#seqname\tk\torientation\tscore\tident\tquery_start\tquery_end\tquery_length\tref_start\tref_end\tref_seqid\tref_desc");
+                
+        SequenceReader queryReader = new SequenceReader(queryFile);
+        Sequence seq;        
+        while ( (seq = queryReader.readNextSequence()) !=null){            
+            final Sequence threadSeq = seq;
 
+            Runnable r = new Runnable() {
+                public void run() {
+                    try {
+                        List<Neighbor> alignments = theObj.findMatch(threadSeq, removeBaseN);
+                        theObj.printAlignment(threadSeq, alignments, out);
+                        outstandingTasks.decrementAndGet();
+                    } catch (Exception e) {
+                        e.printStackTrace();
+                    }
+                }
+            };
+
+            outstandingTasks.incrementAndGet();
+            service.submit(r);
+
+            while (outstandingTasks.get() >= maxTasks);   
+        }
+        
+        service.shutdown();
+        service.awaitTermination(1, TimeUnit.DAYS);        
+        queryReader.close();
+        out.close();        
     }
 }

Debdiff

[The following lists of changes regard files as different if they have different names, permissions or owners.]

Files in second set of .debs but not in first

-rw-r--r--  root/root   /usr/share/java/rdp-alignment-1.2.0+git20201229.1.32bb2e3.jar
lrwxrwxrwx  root/root   /usr/share/java/rdp-alignment.jar -> rdp-alignment-1.2.0+git20201229.1.32bb2e3.jar

Files in first set of .debs but not in second

-rw-r--r--  root/root   /usr/share/java/rdp-alignment-1.2.0.jar
lrwxrwxrwx  root/root   /usr/share/java/rdp-alignment.jar -> rdp-alignment-1.2.0.jar

No differences were encountered in the control files

More details

Full run details