Codebase list augustus / 984ef698-c606-4157-9207-1d9b10a870de/main src / alignment.cc
984ef698-c606-4157-9207-1d9b10a870de/main

Tree @984ef698-c606-4157-9207-1d9b10a870de/main (Download .tar.gz)

alignment.cc @984ef698-c606-4157-9207-1d9b10a870de/mainraw · history · blame

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
/*
 * alignment.cc
 *
 * License: Artistic License, see file LICENSE.TXT or 
 *          https://opensource.org/licenses/artistic-license-1.0
 */

#include "alignment.hh"
#include <string>
#include <iomanip>
#include <queue>
#include <climits>
#include <bitset>
#include <set>
#include <tuple>
#include "lp_lib.h"

using namespace std;

/*
 * constructur using chromosomal position of first nucleotide and row buffer as
 * read from the last column of .maf file
 */
AlignmentRow::AlignmentRow(string seqID, int chrPos, Strand strand, string rowstr) : seqID(seqID), strand(strand), cumFragLen(0) {
    int aliPos = 0;
    int len;
    int i = 0; 
    int alilen = rowstr.size();

    while(i < alilen){
	while (isGap(rowstr[i])){
	    aliPos++;
	    i++;
	}
	len = 0;
	while (i < alilen && !isGap(rowstr[i])){
	    aliPos++;
	    chrPos++;
	    len++;
	    i++;
	}
	if (len>0)
	    addFragment(chrPos - len, aliPos - len, len);
    }
    // row consists of just gaps
    if (frags.empty()){
	addFragment(chrPos, 0, 0); // add a single fragment of length 0 to start of alignment
    }
}

int AlignmentRow::chrStart() const {
    if (frags.empty())
	return -1;
    return frags[0].chrPos;
}

int AlignmentRow::chrEnd() const{
    if (frags.empty()) // this only happens temporarily during appendRow
	return -1;
    fragment last = frags[frags.size()-1];
    return last.chrPos + last.len - 1;
}

int AlignmentRow::aliEnd() const{
    if (frags.empty())
	return -1;
    fragment last = frags[frags.size()-1];
    return last.aliPos + last.len - 1;
}

void AlignmentRow::addFragment(int chrPos, int aliPos, int len){
    frags.push_back(fragment(chrPos, aliPos, len));
    cumFragLen += len;
}

void AlignmentRow::pack(){
    int n = frags.size();
    if (n <= 1)
	return;
    int i=0, // index of next fragment to extend if appropriate
	j=1; // j>i index of next fragment to be extended with if appropriate
    while (j < n){
	// check whether the i-th and j-th fragment can be joined to a single gapless one
	// neither a gap in the alignment nor in the chromosome
	if (frags[i].aliPos + frags[i].len == frags[j].aliPos && frags[i].chrPos + frags[i].len == frags[j].chrPos){
	    frags[i].len += frags[j].len;
	    j++;
	} else {
	    i++;
	    if (i < j)
		frags[i] = frags[j];
	    j++;
	}
    }
    // if successful in packing, above loop leaves a gap at the end of the 'frags' vector, close it
    frags.resize(i+1);
}

// simple left-to-right search starting from given fragment 'from'
// efficient, when many 'chrPos' are searched in left-to-right order
int AlignmentRow::getAliPos(int chrPos, vector<fragment>::const_iterator from){
    if (from == frags.end() || from->chrPos > chrPos) // chrPos the the left of alignment
	return -2;
    while (from != frags.end() && from->chrPos + from->len - 1 < chrPos)
	++from;
    if (from == frags.end())
	return -1;
    // chr          | chrPos 
    //       from->chrPos         
    //         |----------------------|
    // ali   from->aliPos
    if (chrPos < from->chrPos) // chrPos falls in an aligment gap
	return -1;
    return from->aliPos + chrPos - from->chrPos;
}

//same as above, but lets you pass the iterator, which makes multiple use possible
int AlignmentRow::getAliPos(int chrPos, vector<fragment>::const_iterator *from){
    if (*from == frags.end() || (*from)->chrPos > chrPos)
	return -2;
    while (*from != frags.end() && (*from)->chrPos + (*from)->len - 1 < chrPos)
	++*from;
    if (*from == frags.end())
	return -1;
    if (chrPos < (*from)->chrPos)
	return -1;
    return (*from)->aliPos + chrPos - (*from)->chrPos;
}

// convert from alignment to chromosomal position (inverse function of getAliPos()) 
int AlignmentRow::getChrPos(int aliPos, vector<fragment>::const_iterator from){
    if (from == frags.end() || from->aliPos > aliPos) // aliPos to the left of alignment
        return -2;
    while (from != frags.end() && from->aliPos + from->len - 1 < aliPos)
        ++from;
    if (from == frags.end())
        return -1;
    if (aliPos < from->aliPos) // aliPos falls in an aligment gap
        return -1;
    return from->chrPos + aliPos - from->aliPos;
}

/**
 * append row r2 to r1, thereby shifting all alignment coordinates of r2 by aliLen1
 * if signature string (chr and strand) is not empty, treat everything else as missing
 */
void appendRow(AlignmentRow **r1, const AlignmentRow *r2, int aliLen1, string sigstr){
    if (sigstr != "" && *r1 && (*r1)->getSignature() != sigstr)
	*r1 = NULL; // delete left alignment row if not consistent with signature
    if (*r1 == NULL && r2 != NULL && (sigstr == "" || sigstr == r2->getSignature())){
	*r1 = new AlignmentRow();
	(*r1)->seqID = r2->seqID;
    (*r1)->strand = r2->strand;


    #ifdef TESTING
	(*r1)->seqIDarchive = r2->seqIDarchive;
	(*r1)->chrLen = r2->chrLen;				
    #endif

    }

    
    if ((*r1) && r2 &&
	((*r1)->seqID != r2->seqID || r2->chrStart() <= (*r1)->chrEnd())){
	// incomplatible sequence IDs or second row does not come after first row,
	// use the fragments with the longer chromosomal range and throw away the other alignment row
	// unless the sigstr parameter breaks the tie
	if ((*r1)->getSeqLen() >= r2->getSeqLen() || (sigstr != "" && sigstr != r2->getSignature()))
	    return; // implicitly delete the fragments of the second row r2
	else {
	    (*r1)->frags.clear(); // delete the fragments of the first row r1
	    (*r1)->seqID = r2->seqID;
	    (*r1)->strand = r2->strand;
	    (*r1)->cumFragLen = 0;

        #ifdef TESTING
    	(*r1)->seqIDarchive = r2->seqIDarchive;
    	(*r1)->chrLen = r2->chrLen;				
        #endif
	}
    }
    if (r2 && (sigstr == "" || sigstr == r2->getSignature())) {
	// append fragments of r2, right behind the fragments of r1
	for(vector<fragment>::const_iterator it = r2->frags.begin(); it != r2->frags.end(); ++it)
	    (*r1)->addFragment(it->chrPos, it->aliPos + aliLen1, it->len);
    }
}

ostream& operator<< (ostream& strm, const AlignmentRow &row){
    strm << row.seqID << " " << row.strand << "\t" << row.chrStart() << ".." << row.chrEnd() << "\t" << row.getSeqLen() << "\t" << row.getCumFragLen();
    strm << "\t( ";
    for (vector<fragment>::const_iterator it = row.frags.begin(); it != row.frags.end(); ++it)
        strm << "(" << it->chrPos << ", " << it->aliPos << ", " << it->len << ") ";
    strm << ")";
    return strm;
}

ostream& operator<< (ostream& strm, const Alignment &a){
    strm << a.aliLen << " alignment columns" << endl;
    for (size_t i = 0; i < a.rows.size(); i++){
	strm << "row " << setw(3) << i << "\t";
	if (a.rows[i])
	    strm << *a.rows[i];
	else 
	    strm << "missing";
	strm << endl;
    }
    return strm;
}

// printing a summary of an alignment
// like above variant  "cout << alignment", but with text graphics and spaces instead of tabs
// XXXXXXXXXXXX                                                                                                                                                                                                                                                       
// XXXXXXXXXXXXXXXXXXXXXXXXX                                                                                                                                                                                                                                          
//     XXXXXXXXXXXXXXXXXXXXX 
void Alignment::printTextGraph(ostream& strm){
    const int graphWidth = 100;
    strm << aliLen << " alignment columns" << endl;
    
    // determine maximal sequence id width across rows for new tabular printing
    int maxIdLen = 2;
    for (size_t i = 0; i < rows.size(); i++){
        if (rows[i] && maxIdLen < rows[i]->seqID.length())
	    maxIdLen = rows[i]->seqID.length();
    }
    for (size_t i = 0; i < rows.size(); i++){
      strm << "row " << setw(3) << i << "\t";
      if (rows[i]) {
  	  AlignmentRow &row = *rows[i];
 	  strm << setw(maxIdLen+1) << row.seqID << " " << row.strand << setw(10) << row.chrStart() << setw(10) << row.chrEnd() << setw(10) << row.getSeqLen() << setw(8) << row.getCumFragLen();
	  bitset<graphWidth> aligned;
	  for (vector<fragment>::const_iterator it = row.frags.begin(); it != row.frags.end(); ++it) {
	      for (unsigned j = graphWidth * it->aliPos/aliLen; j < (float) graphWidth * (it->aliPos+it->len) / aliLen && j < graphWidth; j++)
	          aligned[j] = true;
	  }
	  strm << "    |";
	  for (unsigned j = 0; j < graphWidth; j++)
	      strm << (aligned[j]? "X" : " ");
	  strm << "|";
      } else  
	  strm << "missing";
      strm << endl;
    }
}

bool mergeable (Alignment *a1, Alignment *a2, int maxGapLen, float mergeableFrac, bool strong){
    int misMatches = 0;
    int maxNumMatches = 0; // weak case: number of rows, where both alignments have fragments
                           // strong case: number of rows, where at least one alignment has fragments
    // this upper bound is for shortcutting the loop, when there are (very) many species
    int veryMaxMisMatches = (1.0 - mergeableFrac) * a1->numRows();
    for (int s=0; s < a1->numRows() && misMatches <= veryMaxMisMatches; s++){
        if (a1->rows[s] && a2->rows[s]){
	    maxNumMatches++;
	    int dist = a2->rows.at(s)->chrStart() - a1->rows.at(s)->chrEnd() - 1; // 0 distance means direct adjacency
	    if (a1->rows[s]->seqID != a2->rows[s]->seqID || 
		a1->rows[s]->strand != a2->rows[s]->strand || dist < 0 || dist > maxGapLen)
		misMatches++;
	} else if (strong && (a1->rows[s] || a2->rows[s])){
	    // in the strong sense of mergeability, rows in which only 1 alignment is present count as mismatches
	    maxNumMatches++;
	    misMatches++;
	}
    }
    return (misMatches <= ((int) ((1.0 - mergeableFrac) * maxNumMatches)));
}

/*
 *
 *  |      this             |   |      other      |
 *   xxxx xxxx     xxxxx         xxxxxx  xxx xxxx
 *   xxxxxxxxxxxxxxxxxxxxxxx        xxxx   xxxxxxx
 *          NULL                  xxxxxxxxxxxxxxx
 *     xxxxxx xxxxxxxxxxxxxx           NULL
 *        xxxxxxx xxxxxxxxxx     xxxxx         xxx
 *
 *
 */
void Alignment::merge(Alignment *other, const MsaSignature *sig){
    for (size_t s=0; s<numRows(); s++)
	if (sig)
	    appendRow(&rows[s], other->rows[s], aliLen, sig->sigrows[s]);
	else
	    appendRow(&rows[s], other->rows[s], aliLen, "");
    aliLen += other->aliLen;
}

Alignment* mergeAliList(list<Alignment*> alis,  const MsaSignature *sig){
    if (alis.empty())
	return NULL;
    // for the species not yet fixed by sig, find the signature with the longest alignment length (majority rule)
    // this could be improved, because same-signature alignments can be unmergeable
    list<Alignment*>::iterator it = alis.begin();
    int k = (*it)->numRows();
    MsaSignature *extSig = new MsaSignature(*sig); // extended signature, not-yet-specified rows can be added
    // count the cumulative fragment length for each signature that occurs
    vector<map<string, int> > sigweights(k);
    map<string, int>::iterator sigit;
    for (; it != alis.end(); ++it){
	for (int i=0; i<k; i++){
	    if ((*it)->rows[i]){
		string sigstr = (*it)->rows[i]->getSignature();
		int weight = (*it)->rows[i]->getCumFragLen();
		sigit = sigweights[i].find(sigstr);
		sigweights[i][sigstr] += weight;
	    }
	}
    }
    // determine the extended signature
    for (int i=0; i<k; i++){
	if (sig->sigrows[i] == ""){
	    int curmax = -1;
	    for (sigit = sigweights[i].begin(); sigit != sigweights[i].end(); ++sigit){
		if (sigit->second > curmax){ // new current maximum
		    extSig->sigrows[i] = sigit->first;
		    curmax = sigit->second;
		}
	    }
	    //if (curmax >= 0)
	    //  cout << "extending signature of " << i << " by " << extSig->sigrows[i] << " weight " << curmax << endl;
	}	
    }
    // cout << "ext signature from path:\t" << extSig->sigstr() << endl;
    Alignment *ma = new Alignment(k);
    // merge all alignments with respect to extended signature into a single alignment ma
    for (it = alis.begin(); it != alis.end(); ++it){
	ma->merge(*it, extSig);
    }
    delete extSig;
    for (int i=0; i<k; i++){ // remove alignment rows which are factually empty
	if (ma->rows[i] && (ma->rows[i]->frags.empty() || ma->rows[i]->getCumFragLen() < 1)){
	    delete ma->rows[i];
	    ma->rows[i] = NULL;
	}
    }
    // cout << "merged alignment:" << *ma << endl;
    return ma;
}

/**
 * For each alignment in the 'alis' list, if necessary, chunk it into piece alignments
 * that have sequence ranges of size at most maxRange. The new alignments are 
 * added to the end. The maxRange limit can be violated in the untypical case that a single
 * gapless fragment exceed this length threshold.
 * The (recursively chosen) split point is before the largest gap in the interval [2/3*maxRange, maxRange].
 */
void capAliSize(list<Alignment*> &alis, int maxRange){
    typedef list<Alignment*>::iterator AliIt;
    if (alis.empty())
	return; 

    AliIt ait = alis.begin();
    int k = (*ait)->numRows(); // number of species
    for (; ait != alis.end(); ++ait){
	if ((*ait)->maxRange() > maxRange){
	    Alignment *a = *ait;
	    //cout << "splitting up alignment with maxRange= " << (*ait)->maxRange() << endl << *a << endl;
	    /* find a split "point" in a left-to-right pass wrt to alignment positions
	     *                              | candidate boundary of cut (only complete fragments are taken)
	     *                              V
	     * xxxxxxxxx                    xxxxxxxxx             xxxxx
	     * xxx           xxxxxx         xx xxx xxxxxxx        xxxxxx
	     * NULL
	     *  xxxxxx     xxxx               xxxxxxxxxxxx           xxxxx
	     *                              ^            ^        ^         
	     * fragment with leftmost begin |            |        | leftmost begin of frags following
	     *        (held in priority queue)    rightmost end of queued frags  
	     * gapsize is the sum of the genomic sequence gaps over all species
	     */

	    priority_queue<BoundaryFragment, vector<BoundaryFragment>, CompareBoundaryFragment > q;
	    vector<int> bidx(k, -1);
	    vector<int> aliPosCutScore(min(a->aliLen,maxRange + 1),0), maxIntraGapSize(k,0);
	    int maxAliPos = maxRange;
	    vector<pair<size_t,size_t>> thresholds; // <threshold,bonus>
	    thresholds.push_back(make_pair(0,0)); thresholds.push_back(make_pair(maxRange/2,maxRange/2)); thresholds.push_back(make_pair(2*maxRange/3,2*maxRange/3));
	    vector<size_t> actThresholdIndex(k,0);

	    for (size_t s=0; s<k; s++)
		if (a->rows[s] && !a->rows[s]->frags.empty()){
		    q.push(BoundaryFragment(s, a->rows[s]->frags[0].aliPos)); // add first fragment of every alignment row
		}
	    while (!q.empty()){
		// retrieve leftmost boundary fragment
		BoundaryFragment bf = q.top();
		size_t s = bf.first;
		// if the next fragment (already verified) exceeds the maxAliPos (max allowed position) cut the alignment at the position with maximum cutScore and end the while loop by emptying the queue
		if (a->rows[s]->frags[bidx[s] + 1].chrPos - a->rows[s]->frags[0].chrPos + 1 > maxAliPos){
		    //cout << a->rows[s]->seqID << ": " << a->rows[s]->frags[bidx[s] + 1].chrPos << " - " << a->rows[s]->frags[0].chrPos + 1 << " > " << maxRange << endl;

		    /*int testMax = 0;
		      for (size_t spe = 0; spe < k; spe++){
		      if (a->rows[spe] && !a->rows[spe]->frags.empty()){
		      int test = a->rows[spe]->chrEnd() - a->rows[spe]->frags[0].chrPos + 1;
		      if (test > testMax)
		      testMax = test;
		      }
		      }
		      int testLen = a->aliLen;*/

		    // update maxAliPos, if we found a smaller one
		    if (maxAliPos > a->rows[s]->frags[bidx[s] + 1].aliPos - 1){
			maxAliPos = a->rows[s]->frags[bidx[s] + 1].aliPos - 1;
		    }
		    // set every score after maxAliPos to -1 (should not be considered anymore)
		    for (size_t actAliPos = maxAliPos + 1; actAliPos < aliPosCutScore.size(); actAliPos++){
			aliPosCutScore[actAliPos] = -1;             // dont cut here! Because of this we set the score to a negative number 
		    }
		    // search for the maxCutScore and the related cutAliPos
		    int maxCutScore = 0;
		    //int lastScore = 0;
		    size_t cutAliPos = 0;
		    for (size_t actAliPos = 0; actAliPos < aliPosCutScore.size(); actAliPos++){
			//if (/*aliPosCutScore[actAliPos] > 0 &&*/ aliPosCutScore[actAliPos] != lastScore)
			//	cout << actAliPos << ": " << aliPosCutScore[actAliPos] << endl;
			//    lastScore = aliPosCutScore[actAliPos];
			if (aliPosCutScore[actAliPos] >= maxCutScore){
			    maxCutScore = aliPosCutScore[actAliPos];
			    cutAliPos = actAliPos;
			}
		    }

		    /*cout << "CutAlnPart:" << endl;
		      for (size_t sss = 0; sss < a->rows.size(); sss++){
		      if (a->rows[sss]){
		      for (int i = 0; i < a->rows[sss]->frags.size(); i++){
		      if ((i+3 >= a->rows[sss]->frags.size() || (i+3 < a->rows[sss]->frags.size() && a->rows[sss]->frags[i+3].aliPos > cutAliPos)) && (i-3 < 0 || (i-3 >= 0 && a->rows[sss]->frags[i-3].aliPos < cutAliPos)))
		      cout << "CutFragments of " << a->rows[sss]->seqID << ": ChrPos: " << a->rows[sss]->frags[i].chrPos << " bis " << a->rows[sss]->frags[i].chrEnd() << " ; AlnPos: " << a->rows[sss]->frags[i].aliPos << " bis " << a->rows[sss]->frags[i].aliPos + a->rows[sss]->frags[i].len - 1 << endl;
		      }
		      }
		      }*/
		    // with this boundary fragment the alignment becomes too long
		    // create new alignment from initial part up to cutAliPos
		    Alignment *newAli = new Alignment(k);
		    int newAliLen = 0, offset = INT_MAX;
		    for (size_t ss = 0; ss < k; ss++){
			if (a->rows[ss]){
			    // AlignmentRow *&row = newAli->rows[ss];
			    newAli->rows[ss] = new AlignmentRow();
			    newAli->rows[ss]->seqID = a->rows[ss]->seqID;
			    newAli->rows[ss]->strand = a->rows[ss]->strand;

                #ifdef TESTING
			    newAli->rows[ss]->seqIDarchive = a->rows[ss]->seqIDarchive;
			    newAli->rows[ss]->chrLen = a->rows[ss]->chrLen;				
                #endif
                
			    vector<fragment>::iterator first = a->rows[ss]->frags.begin(), 
				last = a->rows[ss]->frags.end(), it, itCutPos;

			    // copy the initial part, save the cutPosIterator and test if a hardCut (cut throw fragment) is necessary
			    bool hardCut = false;
			    itCutPos = last;
			    for (it = first; it != last; it++){
				if ((*it).aliPos + (*it).len - 1 <= cutAliPos)
				    newAli->rows[ss]->addFragment(*it);
				else{
				    itCutPos = it;
				    if ((*it).aliPos <= cutAliPos)
					hardCut = true;
				    break;
				}
			    }
			    // do the hardCut
			    if (hardCut){
				newAli->rows[ss]->addFragment(*itCutPos);
				newAli->rows[ss]->frags.back().len = cutAliPos - (*itCutPos).aliPos + 1;
				(*itCutPos).len -= newAli->rows[ss]->frags.back().len;
				(*itCutPos).aliPos = cutAliPos + 1;
				(*itCutPos).chrPos = (*itCutPos).chrPos + newAli->rows[ss]->frags.back().len;
			    }
			    // update newAliLen as maximum of all new alignment positions
			    if (newAli->rows[ss]->aliEnd() > newAliLen)
				newAliLen = newAli->rows[ss]->aliEnd();
			    // delete the fragments just moved to the new alignment from the old alignment a
			    a->rows[ss]->frags.erase(first, itCutPos);
			    a->rows[ss]->setCumFragLen(a->rows[ss]->getCumFragLen() - newAli->rows[ss]->getCumFragLen());
			    if (a->rows[ss]->frags.empty()){
				delete a->rows[ss]; 
				a->rows[ss] = NULL;
			    }
			    // update offset as minimum of all remaining alignment positions
			    if (a->rows[ss] && a->rows[ss]->frags.size() > 0 && offset > a->rows[ss]->frags[0].aliPos)
				offset = a->rows[ss]->frags[0].aliPos;
			}
		    }
		    newAli->aliLen = newAliLen;
		    //cout << "new alignment: " << *newAli << endl;
		    // left-shift all alignment positions in the remaining alignment fragments by offset
		    a->shiftAliPositions(-offset);
		    // append remaining alignment to the end, required further shortening will be done when its turn comes in the main loop


		    /*int test1Max = 0, test2Max = 0;
		      for (size_t spe = 0; spe < k; spe++){
		      if (newAli->rows[spe] && !newAli->rows[spe]->frags.empty()){
		      int test1 = newAli->rows[spe]->chrEnd() - newAli->rows[spe]->frags[0].chrPos + 1;
		      if (test1 > test1Max)
		      test1Max = test1;
		      }
		      if (a->rows[spe] && !a->rows[spe]->frags.empty()){
		      int test2 = a->rows[spe]->chrEnd() - a->rows[spe]->frags[0].chrPos + 1;
		      if (test2 > test2Max)
		      test2Max = test2;
		      }
		      }
		      cerr << a->rows[s]->seqID << ": " << testLen << " = " << newAli->aliLen << " + " << a->aliLen << " (" << newAli->aliLen + a->aliLen << ") " << " --- " << testMax << " >= " << test1Max << " + " << test2Max << endl;*/

		    if (a->numFilledRows() > 1)
			alis.push_back(a);
		    else // remaining alignment empty or has just one row
			delete a;
		    // replace old long alignment with its initial part
		    *ait = newAli;
		    // clear the queue, so the while loop exits and the flow proceeds to the next alignment
		    while (!q.empty())
			q.pop();
		} else {
		    q.pop();
		    // actualize the fragment-index of the actual species to the actual fragment
		    bidx[s]++;
		    // if the end of the actual fragment is bigger than maxRange, then update maxAliPos
		    if (a->rows[s]->frags[bidx[s]].chrEnd() - a->rows[s]->frags[0].chrPos + 1 > maxRange){
			size_t tempMaxAliPos = a->rows[s]->frags[bidx[s]].aliPos + (maxRange - (a->rows[s]->frags[bidx[s]].chrPos - a->rows[s]->frags[0].chrPos + 1));
			if (maxAliPos > tempMaxAliPos)
			    maxAliPos = tempMaxAliPos;
		    }

		    int actFragStart = a->rows[s]->frags[bidx[s]].chrPos - a->rows[s]->frags[0].chrPos + 1;
		    int actFragEnd = a->rows[s]->frags[bidx[s]].chrEnd() - a->rows[s]->frags[0].chrPos + 1;

		    // if this is not the last fragment
		    if (bidx[s] + 1 < a->rows[s]->frags.size()){
			// calculate the size of the gap after this fragment until the next fragment
			size_t tempSize = a->rows[s]->gapLenAfterFrag(bidx[s]);
			// save the maximum gapSize of this species to add this as score for every possible cut position after the last fragment of this species
			if (maxIntraGapSize[s] < tempSize)
			    maxIntraGapSize[s] = tempSize;
			int nextFragStart = a->rows[s]->frags[bidx[s] + 1].chrPos - a->rows[s]->frags[0].chrPos + 1;
			// search for the right actual threshold (normaly this loop will only )
			while (actThresholdIndex[s]+1 < thresholds.size() && nextFragStart >= thresholds[actThresholdIndex[s]+1].first){
			    actThresholdIndex[s]++;
			}
			// from end of actual fragment to the position before the start of the next fragment
			for (size_t actAliPos = a->rows[s]->frags[bidx[s]].aliPos + a->rows[s]->frags[bidx[s]].len - 1; actAliPos < min(maxAliPos + 1, a->rows[s]->frags[bidx[s] + 1].aliPos); actAliPos++){
			    aliPosCutScore[actAliPos] += tempSize + thresholds[actThresholdIndex[s]].second;
			}

			// from start of next fragment to 1 pos before end of next fragment (or maxAliPos)
			int tempChrPos = nextFragStart - a->rows[s]->frags[bidx[s] + 1].aliPos;
			for (size_t actAliPos = a->rows[s]->frags[bidx[s] + 1].aliPos; actAliPos < min(maxAliPos + 1, a->rows[s]->frags[bidx[s] + 1].aliPos + a->rows[s]->frags[bidx[s] + 1].len - 1); actAliPos++){
			    // if the fragment lies on the next threshold or it is already the last threshold, only add bonusScore until one position before the last position in the threshold (because we cut behind this position and for cutting behind the last position we have another score)
			    if (actThresholdIndex[s] + 1 == thresholds.size() || (actThresholdIndex[s] + 1 < thresholds.size() && tempChrPos + actAliPos < thresholds[actThresholdIndex[s] + 1].first)){
				aliPosCutScore[actAliPos] += thresholds[actThresholdIndex[s]].second;
			    }else{break;}
			}
			// add the next fragment of this species and this alignment to the queue
			q.push(BoundaryFragment(s, a->rows[s]->frags[bidx[s] + 1].aliPos));
		    }else{ // if this is the last fragment add the 1 + maximum gap size of this species to score (and dont forget the threshold bonus)
			for (size_t actAliPos = a->rows[s]->frags[bidx[s]].aliPos + a->rows[s]->frags[bidx[s]].len - 1; actAliPos < aliPosCutScore.size(); actAliPos++){
			    aliPosCutScore[actAliPos] += maxIntraGapSize[s] + 1 + thresholds[actThresholdIndex[s]].second;
			}
		    }

		    // if the fragment lies exactly on the threshold we had to add the threshold score for the positions of the fragment who came behind the threshold
		    if (actFragStart < thresholds[actThresholdIndex[s]].first && actFragEnd >= thresholds[actThresholdIndex[s]].first){
			// from fragmentCutPos to one pos before end of fragment (or maxAliPos)
			for (size_t actAliPos = a->rows[s]->frags[bidx[s]].aliPos + (thresholds[actThresholdIndex[s]].first - actFragStart); actAliPos < min(maxAliPos + 1, a->rows[s]->frags[bidx[s]].aliPos + a->rows[s]->frags[bidx[s]].len - 1); actAliPos++){
			    aliPosCutScore[actAliPos] += thresholds[actThresholdIndex[s]].second;
			}
		    }
		}
	    }
	}
    } 
}

struct SegSortCriterion {
    bool operator() (const tuple<int, set<int>, set<int> > &lhs, const tuple<int, set<int>, set<int> > &rhs) {
        return get<0>(lhs) < get<0>(rhs);
    }
};

/*
 * Remove an optimally chosen subset of alignments from alis based on a geneRange overlap criterion.
 * Genomic bases in any species that are covered by some alignment in the input set but not by any in the subset
 * are penalized by uncovPen. Genomic bases that are covered by more than maxCov alignments 
 * in the subset are penalized by covPen for each additional covering alignment.
 * A minimum penalty subset of alignments is chosen and the remaining alinment are removed from alis.
 * Increasing the coverage penalty covPen punishes long overlaps more.
 * Decreasing maxCov punishes multiple overlapping geneRanges more.
 */
void reduceOvlpRanges(list<Alignment*> &alis, size_t maxCov, float covPen){
    if (alis.empty())
	return; 
    float uncovPen = 1.0; // no need to change this

    typedef list<Alignment*>::iterator AliIt;
    AliIt ait = alis.begin();
    int k = (*ait)->numRows(); // number of species
    int n = alis.size(); // number of unfiltered alignments
    /* segments will hold for each species.chr combination a sorted list of nonoverlapping 
     * parts of sequence ranges together with the set of alignment indices that cover it.
     * Before the algorithm below the triple holds 
     *    1. a genomic coordinate
     *    2. a singleton set of an index of an alignment that start at this coord
     *    3. a singleton set of an index of an alignment that ends at this coord
     * Upon finishing of the algorithm the second component is
     *    2. the set of indices of alignments that cover coord up to and excluding the next coord
     */
    map<string, list<tuple<int, set<int>, set<int> > > >  segments;
    
    int i; // alignments are identified by their index in alis list
    // insert alignment sequence ranges into lists
    for (i=0; ait != alis.end(); ++ait, i++){
	Alignment *a = *ait;
	//cout << "i=" << i << "\t";
	//a->printTextGraph(cout);
	for (size_t s=0; s<k; s++){
	    if (a->rows[s]){
		string key = string(itoa(s)) + "." + a->rows[s]->seqID;
		set<int> diffset, emptyset;
		diffset.insert(i);
		// insert the two segment boundaries
		segments[key].push_back(make_tuple(a->rows[s]->chrStart(), diffset, emptyset)); // alignment i starts
		segments[key].push_back(make_tuple(a->rows[s]->chrEnd()+1, emptyset, diffset)); // alignment i ends
	    }
	}
    }

    /* Algorithm to make segment lists overlap-free and nonredundant
     * e.g. turn --------------         into 4 nonoverlapping segments
     *                ---------
     *                   -----------
     */
    for (auto it = segments.begin(); it != segments.end(); ++it){ // loop over species.chr
	string key = it->first;
	auto &segs = it->second; // segment list for this species.chr combination, reference as list is changed
	// sort intervals by genomic coordinate
	segs.sort(SegSortCriterion());
	set<int> curSet; // set of alignment indices that cover the segment starting at current pos

	for (auto lit = segs.begin(); lit != segs.end(); ){
	    curSet.insert(get<1>(*lit).begin(), get<1>(*lit).end()); // add alignment that starts here (if any)
	    // remove (singleton) set of alignments ending here
	    if (!get<2>(*lit).empty())
		curSet.erase(*(get<2>(*lit).begin()));
	    if (next(lit) != segs.end()	&& get<0>(*lit) == get<0>(*next(lit))){ // next segment has same coord
		lit = segs.erase(lit); // remove -> always keep only last segment with this coord
	    } else {
		get<1>(*lit) = curSet;
		++lit;
	    }
	}
	/* DEBUG print alignments covering each nonoverlapping segment
	cout << key << "\t";
	for (auto lit = segs.begin(); lit != segs.end(); ++lit){
	    cout << " " << get<0>(*lit) << ":";
	    for (auto ait = get<1>(*lit).begin(); ait != get<1>(*lit).end(); ++ait)
		cout << *ait << ",";
	    cout << "\t";
	}
	cout << endl;*/
    }

    /* 
     * set up a mixed integer linear program to be solved by the lpsolve library
     */
    
    int R=0; // number of segments, two inequality constraints per segment
    for (auto it = segments.begin(); it != segments.end(); ++it) // loop over species.chr
	for (auto lit = it->second.begin(); next(lit) != it->second.end(); ++lit)	
	    if (!get<1>(*lit).empty())
		R++;
    
    lprec *lp; // linear program object
    int Ncol = n + 2*R;
    // order of variables
    // x_1, ..., x_n    binary x_i=1 iff i-th alignment is chosen in subset
    // y_1, ..., y_R    implicitly binary, y_j = 1 iff j-th segment is not covered (in optimal solution)
    // z_1, ..., z_R    implicitly int, z_j = no of coverages of j-th segment in excess of maxCov (in opt. sol.)

    lp = make_lp(0, Ncol); // model with Ncol columns and yet 0 rows
    if (lp == NULL)
	throw ProjectError("reduceOvlpRanges: Could not construct linear program");
    
    // create space for one row
    int *colno = new int[Ncol];
    REAL *row = new REAL[Ncol];

    int r = 0; // 0 <= r < R running index for segment
    int j; // j running index for constraint
    vector<int> segLen(R); // segment lengths = weights in linear program
    for (auto it = segments.begin(); it != segments.end(); ++it){ // loop over species.chr    
	auto segs = it->second; // segment list for this species.chr combination
	for (auto lit = segs.begin(); next(lit) != segs.end(); ++lit){
	    if (!get<1>(*lit).empty()){ // intermediate segments may also be uncovered by any alignment
		segLen[r] = get<0>(*next(lit)) - get<0>(*lit); // segment length, a multiplier
		// TODO: this may potentially be improved if the weighting uses the actually aligned bases in the segment
		// rather than just the length of the region spanned by the last and first aligned base.
		// cout << "segLen[" << r << "]=" << segLen[r] << endl;
		// add constraints like x2 + x5 + x7 + y1 >= 1 to penalize no coverage (y1 = 1)
		j = 0;
		for (auto ait = get<1>(*lit).begin(); ait != get<1>(*lit).end(); ++ait){
		    colno[j] = *ait + 1;
		    row[j++] = 1;
		}
		colno[j] = n + r + 1;
		row[j++] = 1;
		add_constraintex(lp, j, row, colno, GE, 1);

		// add constraints like x2 + x5 + x7 - z1 <= 3 to penalize too much coverage (z1>0)
		j = 0;
		for (auto ait = get<1>(*lit).begin(); ait != get<1>(*lit).end(); ++ait){
		    colno[j] = *ait + 1;
		    row[j++] = 1;
		}
		colno[j] = n + R + r + 1;
		row[j++] = -1;
		add_constraintex(lp, j, row, colno, LE, maxCov);
		r++;
	    }
	}
    }

    // make x_i's binary variables; x_i = 1 iff i-th alignment is chosen for subset
    for (int i=1; i <= n; i++)
	set_binary(lp, i, TRUE);

    set_add_rowmode(lp, FALSE);
    
    // objective function: sum_j noCovPen * y_j + uncovPen * z_j
    j = 0;
    for (r=0; r<R; r++){
	 colno[j] = n + R + r + 1;
	 row[j++] = covPen * segLen[r];
	 colno[j] = n + r + 1;
	 row[j++] = uncovPen * segLen[r];
    }
    set_obj_fnex(lp, j, row, colno);
    // write_LP(lp, stdout);
    set_verbose(lp, IMPORTANT);
    int ret = solve(lp);
    if (ret == 0){
	cout << "MILP objective=" << get_objective(lp) << endl;
	get_variables(lp, row);
	cout << "removing alignments ";
	for (i=0; i < n; i++)
	    if (row[i]<0.5)
		cout << i << " ";
	cout << endl;
	/*
	  for (r=0; r<R; r++){
	  if (row[n+r] + row[n+R+r] > 0)
	  cout << r+1 << "\ty=" << row[n+r] << "\tz=" << row[n+R+r] << endl; 
	  }*/

	/*
	 * remove alignments from alis that are not in the optimal subset
	 * and compute some statistics
	 */ 
	i = 0;
	int numRemovedAlis = 0;
	int cumFragLenBefore = 0, cumFragLenRemoved = 0;
	for (auto ait = alis.begin(); ait != alis.end(); i++){
	    int cfl = (*ait)->getCumFragLen();
	    cumFragLenBefore += cfl;
	    if (row[i] < 0.5){
		cumFragLenRemoved += cfl;
		ait = alis.erase(ait);
		numRemovedAlis++;
	    } else
		++ait;
	}
	int segLenBefore = 0, segLenRemoved = 0;
	for (r=0; r<R; r++){
	    segLenBefore += segLen[r];
	    if (row[n + r + 1] > 0.5)
		segLenRemoved += segLen[r];
	}

	cout << numRemovedAlis << " geneRanges=alignments (" 
	     << (0.1 * (int) (1000.0 * numRemovedAlis / (numRemovedAlis + alis.size())))
	     << "%) and " << cumFragLenRemoved << " / " << cumFragLenBefore
	     << " = " << (0.1 * (int) (1000.0 * cumFragLenRemoved / cumFragLenBefore))
	     << "% of aligned bases and " << segLenRemoved << " / " << segLenBefore << " = "
	     << (0.1 * (int) (1000.0 * segLenRemoved / segLenBefore))
	     << "% of aligment-range-covered bases were deleted because of too much redundancy." << endl;
    } else { // if this sometimes does not work it is not a big deal
	cerr << "Warning: Could not optimally solve mixed linear integer program. Return value= " << ret << endl;
    }

    // temp for debug, output filtered alignment list
    /*
    cout << "filtered alignments: " << alis.size() << endl;
    for (auto ait = alis.begin(); ait != alis.end(); ++ait){
	(*ait)->printTextGraph(cout);
	cout << endl;
	}*/

    delete [] colno;
    delete [] row;
}

int Alignment::maxRange(){
    int range, max = 0;
    for(size_t s=0; s<numRows(); s++){
	range = rows[s]? rows[s]->getSeqLen() : 0;
	if (range > max)
	    max = range;
    }
    return max;
}

int Alignment::numFilledRows() const {
    int m = 0;
    for (size_t s=0; s<rows.size(); s++)
	if (rows[s])
	    m++;
    return m;
}

int Alignment::getMaxSeqIdLen() const {
    int maxNameLen = 0;
    for(size_t s=0; s<numRows(); s++)
	if (rows[s] && rows[s]->seqID.length() > maxNameLen)
	    maxNameLen = rows[s]->seqID.length();
    
    return maxNameLen;
}

int medianChrStartEndDiff(Alignment *a1, Alignment *a2){
    if (a1->numRows() != a2->numRows())
	return 0;
    vector<int> diffs;
    for(size_t s=0; s<a1->numRows(); s++){
	AlignmentRow *r1 = a1->rows[s], *r2 = a2->rows[s];
	if (r1 && r2 && r1->strand == r2->strand && r1->seqID == r2->seqID){
	    if (r1->chrStart() - r2->chrEnd() >=0)
		diffs.push_back(r1->chrStart() - r2->chrEnd());
	}
    }
    return quantile(diffs, 0.5);
}

string Alignment::getSignature() const {
    string sig = "";
    for(size_t s=0; s<numRows(); s++)
	if (rows[s])
	    sig += itoa(s) + ":" + rows[s]->getSignature();
    return sig;
}

int Alignment::getCumFragLen() const{
    int cumFragLen = 0;
    for(size_t s=0; s<numRows(); s++)
	if (rows[s])
	    cumFragLen += rows[s]->getCumFragLen();
    return cumFragLen;
}

int Alignment::numFitting(const MsaSignature *sig) const{
    int numFitting = 0;
    for (size_t s = 0; s < numRows(); s++){
	if (sig->fits(*this, s))
	    numFitting++;
    }
    return numFitting;
}

void Alignment::shiftAliPositions(int offset){
    for(size_t s=0; s < numRows(); s++)
	if (rows[s]){
	    for (int i=0; i < rows[s]->frags.size(); i++)
		rows[s]->frags[i].aliPos += offset;
	}
    aliLen += offset;
}

// merge pairs of fragments without gap into one fragment making the alignment representation more compact
void Alignment::pack(){
    for(size_t s=0; s < numRows(); s++)
	if (rows[s])
	    rows[s]->pack();
}


int MsaSignature::maxSigStrLen = 0;

#ifdef TESTING
// convert the alignment according to new intervals/FASTA (start/end are in bed format)
void Alignment::convertAlignment(int r, int start, int end) {
    if(rows[r] != NULL){
        int newChrLen = end - start + 1;

        for(int f=0;f<rows[r]->frags.size();++f){
            if(rows[r]->strand == plusstrand){
                // cout << "converting " << start << " " << end << " " << rows[r]->seqID << " " << rows[r]->frags[f].chrPos << " " << start << " " << rows[r]->frags[f].chrPos - start << endl;
                rows[r]->frags[f].chrPos -= start;
            }
            else{
                rows[r]->frags[f].chrPos = rows[r]->chrLen - rows[r]->frags[f].chrPos;  // onto the positive strand
                rows[r]->frags[f].chrPos -= start;
                rows[r]->frags[f].chrPos =  newChrLen - rows[r]->frags[f].chrPos;       // back onto the negative strand (remark the new length is used)
            }
        }

        rows[r]->seqID = rows[r]->seqID + "_" +  to_string(start) + "_" + to_string(end+1); // the identifier is again in BED format (open interval)
        rows[r]->chrLen = newChrLen;
    }        
}
#endif

// StringAlignment stuff

ostream& operator<< (ostream& strm, const StringAlignment &msa){
    for (size_t s=0; s < msa.k; s++)
        if (msa.rows[s].length() > 0)
            strm << s << "\t" << msa.rows[s] << endl;
    return strm;
}

void StringAlignment::insert(std::list<MsaInsertion> &insList, int maxInsertLen){
    insList.sort();
    int oldinsertpos = -1;

    for (MsaInsertion msains : insList){
        if (rows[msains.s].empty())
            continue; // no insertion into empty rows

	size_t insertsize = msains.insert.length();
	if (insertsize > maxInsertLen){
	    // round down to a multiple of 3 for reading-frame consistency
	    // cout << "long insert " << insertsize << endl;
	    insertsize = (maxInsertLen / 3) * 3;
	}
        if (msains.insertpos != oldinsertpos){
            // new insert colums are always started with the longest
            // inserts to make sure there is enough space
            // insert gap colums for whole aligment
            for (size_t s=0; s < k; s++) {
                if (!rows[s].empty()){
                    rows[s].insert(msains.insertpos, string(insertsize, '-'));
                }
            }
            oldinsertpos = msains.insertpos;
        }
        // replace gaps left justified with insert
        rows[msains.s].replace(msains.insertpos, insertsize, msains.insert, 0, insertsize);
    }
}

size_t StringAlignment::removeGapOnlyCols(){
    size_t numRemovedCols = 0;

    for (int j = len - 1; j >= 0; --j){
	// TODO: remove consecutive ranges of all-gap cols at once for efficiency
        if (isGapOnlyCol(j)){
            // delete column j
            numRemovedCols++;
            for (size_t s=0; s < k; s++) {
                if (!rows[s].empty()){
                    rows[s].erase(j, 1);
                }
            }
        }
    }
    return numRemovedCols;
}