Codebase list libfastutil-java / 09029da4-8c44-476c-bae7-d2e6de63657f/upstream/8.5.11+git20230109.1.7084407+dfsg drv / ArrayFrontCodedList.drv
09029da4-8c44-476c-bae7-d2e6de63657f/upstream/8.5.11+git20230109.1.7084407+dfsg

Tree @09029da4-8c44-476c-bae7-d2e6de63657f/upstream/8.5.11+git20230109.1.7084407+dfsg (Download .tar.gz)

ArrayFrontCodedList.drv @09029da4-8c44-476c-bae7-d2e6de63657f/upstream/8.5.11+git20230109.1.7084407+dfsgraw · history · blame

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
/*
 * Copyright (C) 2002-2022 Sebastiano Vigna
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package PACKAGE;

import static it.unimi.dsi.fastutil.BigArrays.copyFromBig;
import static it.unimi.dsi.fastutil.BigArrays.copyToBig;
import static it.unimi.dsi.fastutil.BigArrays.grow;
import static it.unimi.dsi.fastutil.BigArrays.trim;


import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.objects.AbstractObjectList;
import it.unimi.dsi.fastutil.objects.ObjectListIterator;
#if ! KEY_CLASS_Long
import it.unimi.dsi.fastutil.longs.LongArrays;
#endif

import java.io.Serializable;
import java.util.Iterator;
import java.util.Collection;
import java.util.NoSuchElementException;
import java.util.RandomAccess;

/** Compact storage of lists of arrays using front-coding (also known as prefix-omission) compression.
 *
 * <p>This class stores immutably a list of arrays in a single {@linkplain it.unimi.dsi.fastutil.BigArrays big array}
 * using front coding (of course, the compression will be reasonable only if
 * the list is sorted lexicographically&mdash;see below). It implements an
 * immutable type-specific list that returns the <var>i</var>-th array when
 * calling {@link #get(int) get(<var>i</var>)}. The returned array may be
 * freely modified.
 *
 * <p>Front-coding (also known as prefix-omission) compression 
 * is based on the idea that if the <var>i</var>-th and the
 * (<var>i</var>+1)-th array have a common prefix, we might store the length
 * of the common prefix, and then the rest of the second array.
 *
 * <p>This approach, of course, requires that once in a while an array is
 * stored entirely.  The <em>ratio</em> of a front-coded list defines how
 * often this happens (once every {@link #ratio()} arrays). A higher ratio
 * means more compression, but means also a longer access time, as more arrays
 * have to be probed to build the result. Note that we must build an array
 * every time {@link #get(int)} is called, but this class provides also methods
 * that extract one of the stored arrays in a given array, reducing garbage
 * collection. See the documentation of the family of {@code get()}
 * methods.
 *
 * <p>By setting the ratio to 1 we actually disable front coding: however, we
 * still have a data structure storing large list of arrays with a reduced
 * overhead (just one integer per array, plus the space required for lengths).
 *
 * <p>Note that the typical usage of front-coded lists is under the form of
 * serialized objects; usually, the data that has to be compacted is processed
 * offline, and the resulting structure is stored permanently. Since the
 * pointer array is not stored, the serialized format is very small.
 *
 * <H2>Implementation Details</H2>
 *
 * <p>All arrays are stored in a {@linkplain it.unimi.dsi.fastutil.BigArrays big array}. A separate array of pointers
 * indexes arrays whose position is a multiple of the ratio: thus, a higher ratio
 * means also less pointers.
 *
 * <p>More in detail, an array whose position is a multiple of the ratio is
 * stored as the array length, followed by the elements of the array. The array
 * length is coded by a simple variable-length list of <var>k</var>-1 bit
 * blocks, where <var>k</var> is the number of bits of the underlying primitive
 * type.  All other arrays are stored as follows: let {@code common} the
 * length of the maximum common prefix between the array and its predecessor.
 * Then we store the array length decremented by {@code common}, followed
 * by {@code common}, followed by the array elements whose index is
 * greater than or equal to {@code common}. For instance, if we store
 * {@code foo}, {@code foobar}, {@code football} and
 * {@code fool} in a front-coded character-array list with ratio 3, the
 * character array will contain
 *
 * <pre>
 * <b>3</b> f o o <b>3</b> <b>3</b> b a r <b>5</b> <b>3</b> t b a l l <b>4</b> f o o l
 * </pre>
 */

public class ARRAY_FRONT_CODED_LIST extends AbstractObjectList<KEY_TYPE[]> implements Serializable, Cloneable, RandomAccess {

	private static final long serialVersionUID = 1L;

	/** The number of arrays in the list. */
	protected final int n;
	/** The ratio of this front-coded list. */
	protected final int ratio;
	/** The big array containing the compressed arrays. */
	protected final KEY_TYPE[][] array;
	/** The pointers to entire arrays in the list. */
	protected transient long[] p;

	/** Creates a new front-coded list containing the arrays returned by the given iterator.
	 *
	 * @param arrays an iterator returning arrays.
	 * @param ratio the desired ratio.
	 */

	public ARRAY_FRONT_CODED_LIST(final Iterator<KEY_TYPE[]> arrays, final int ratio) {

		if (ratio < 1) throw new IllegalArgumentException("Illegal ratio (" + ratio + ")");

		KEY_TYPE[][] array = BIG_ARRAYS.EMPTY_BIG_ARRAY;
		long[] p = LongArrays.EMPTY_ARRAY;

		KEY_TYPE[][] a = new KEY_TYPE[2][];
		long curSize = 0;
		int n = 0, b = 0, length;

		while(arrays.hasNext()) {
			a[b] = arrays.next();
			length = a[b].length;

			if (n % ratio == 0) {
				p = LongArrays.grow(p, n / ratio + 1);
				p[n / ratio] = curSize;

				array = grow(array, curSize + count(length) + length, curSize);
				curSize += writeInt(array, length, curSize);
				copyToBig(a[b], 0, array, curSize, length);
				curSize += length;
			}
			else {
				final int minLength = Math.min(a[1 - b].length, length);
				int common;
				for(common = 0; common < minLength; common++) if (a[0][common] != a[1][common]) break;
				length -= common;

				array = grow(array, curSize + count(length) + count(common) + length, curSize);
				curSize += writeInt(array, length, curSize);
				curSize += writeInt(array, common, curSize);
				copyToBig(a[b], common, array, curSize, length);
				curSize += length;
			}

			b = 1 - b;
			n++;
		}

		this.n = n;
		this.ratio = ratio;
		this.array = trim(array, curSize);
		this.p = LongArrays.trim(p, (n + ratio - 1) / ratio);

	}

	/** Creates a new front-coded list containing the arrays in the given collection.
	 *
	 * @param c a collection containing arrays.
	 * @param ratio the desired ratio.
	 */

	public ARRAY_FRONT_CODED_LIST(final Collection<KEY_TYPE[]> c, final int ratio) {
		this(c.iterator(), ratio);
	}



	/* The following (rather messy) methods implements the encoding of arbitrary integers inside a big array.
	 * Unfortunately, we have to specify different codes for almost every type. */

	/** Reads a coded length.
	 * @param a the data big array.
	 * @param pos the starting position.
	 * @return the length coded at {@code pos}.
	 */
	static int readInt(final KEY_TYPE a[][], long pos) {
#if KEY_CLASS_Integer
		return BigArrays.get(a, pos);
#elif KEY_CLASS_Long
		return (int)BigArrays.get(a, pos);
#elif KEY_CLASS_Character
		final char c0 = BigArrays.get(a, pos);
		return c0  < 0x8000 ? c0 : (c0 & 0x7FFF) << 16 | BigArrays.get(a, pos + 1);
#elif KEY_CLASS_Short
		final short s0 = BigArrays.get(a, pos);
		return s0 >= 0 ? s0 : s0 << 16 | (BigArrays.get(a, pos + 1) & 0xFFFF);
#else
		final byte b0 = BigArrays.get(a, pos);
		if (b0  >= 0) return b0;
		final byte b1 = BigArrays.get(a, pos + 1);
		if (b1 >= 0) return (- b0 - 1) << 7 | b1;
		final byte b2 = BigArrays.get(a, pos + 2);
		if (b2 >= 0) return (- b0 - 1) << 14 | (- b1 - 1) << 7 | b2;
		final byte b3 = BigArrays.get(a, pos + 3);
		if (b3 >= 0) return (- b0 - 1) << 21 | (- b1 - 1) << 14 | (- b2 - 1) << 7 | b3;
		return (- b0 - 1) << 28 | (- b1 - 1) << 21 | (- b2 - 1) << 14 | (- b3 - 1) << 7 | BigArrays.get(a, pos + 4);
#endif
	}

	/** Computes the number of elements coding a given length.
	 * @param length the length to be coded.
	 * @return the number of elements coding {@code length}.
	 */
	static int count(final int length) {
#if KEY_CLASS_Integer || KEY_CLASS_Long
		return 1;
#elif KEY_CLASS_Character || KEY_CLASS_Short
		return length < (1 << 15) ? 1 : 2;
#else
		if (length < (1 << 7)) return 1;
		if (length < (1 << 14)) return 2;
		if (length < (1 << 21)) return 3;
		if (length < (1 << 28)) return 4;
		return 5;
#endif
	}

	/** Writes a length.
	 * @param a the data array.
	 * @param length the length to be written.
	 * @param pos the starting position.
	 * @return the number of elements coding {@code length}.
	 */
	static int writeInt(final KEY_TYPE a[][], int length, long pos) {
#if KEY_CLASS_Long
		BigArrays.set(a, pos, length);
		return 1;
#elif KEY_CLASS_Integer
		BigArrays.set(a, pos, length);
		return 1;
#elif KEY_CLASS_Character
		if (length < (1 << 15)) {
			BigArrays.set(a, pos, (char)length);
			return 1;
		}
		BigArrays.set(a, pos++, (char)(length >>> 16 | 0x8000));
		BigArrays.set(a, pos, (char)(length & 0xFFFF));
		return 2;
#elif KEY_CLASS_Short
		if (length < (1 << 15)) {
			BigArrays.set(a, pos, (short)length);
			return 1;
		}
		BigArrays.set(a, pos++, (short)(- (length >>> 16) - 1));
		BigArrays.set(a, pos, (short)(length & 0xFFFF));
		return 2;
#else
		final int count = count(length);
		BigArrays.set(a, pos + count - 1, (byte)(length & 0x7F));

		for(int i = count -1; i-- != 0;) {
			length >>>= 7;
			BigArrays.set(a, pos + i, (byte)(- (length  & 0x7F) - 1));
		}

		return count;
#endif
	}



	/** Returns the ratio of this list.
	 *
	 * @return the ratio of this list.
	 */

	public int ratio() {
		return ratio;
	}


	/** Computes the length of the array at the given index.
	 *
	 * <p>This private version of {@link #arrayLength(int)} does not check its argument.
	 *
	 * @param array the data array.
	 * @param index an index.
	 * @return the length of the {@code index}-th array.
	 */
	private int length(final int index) {
		final KEY_TYPE[][] array = this.array;
		final int delta = index % ratio; // The index into the p array, and the delta inside the block.

		long pos = p[index / ratio]; // The position into the array of the first entire word before the index-th.
		int length = readInt(array, pos);

		if (delta == 0) return length;

		// First of all, we recover the array length and the maximum amount of copied elements.
		int common;
		pos += count(length) + length;
		length = readInt(array, pos);
		common = readInt(array, pos + count(length));

		for(int i = 0; i < delta - 1; i++) {
			pos += count(length) + count(common) + length;
			length = readInt(array, pos);
			common = readInt(array, pos + count(length));
		}

		return length + common;
	}


	/** Computes the length of the array at the given index.
	 *
	 * @param index an index.
	 * @return the length of the {@code index}-th array.
	 */
	public int arrayLength(final int index) {
		ensureRestrictedIndex(index);
		return length(index);
	}

	/** Extracts the array at the given index.
	 *
	 * @param index an index.
	 * @param a the array that will store the result (we assume that it can hold the result).
	 * @param offset an offset into {@code a} where elements will be store.
	 * @param length a maximum number of elements to store in {@code a}.
	 * @return the length of the extracted array.
	 */
	private int extract(final int index, final KEY_TYPE a[], final int offset, final int length) {
		final int delta = index % ratio; // The delta inside the block.
		final long startPos = p[index / ratio]; // The position into the array of the first entire word before the index-th.
		long pos, prevArrayPos;
		int arrayLength = readInt(array, pos = startPos), currLen = 0, actualCommon;

		if (delta == 0) {
			pos = p[index / ratio] + count(arrayLength);
			copyFromBig(array, pos, a, offset, Math.min(length, arrayLength));
			return arrayLength;
		}

		int common = 0;

		for(int i = 0; i < delta; i++) {
			prevArrayPos = pos + count(arrayLength) + (i != 0 ? count(common) : 0);
			pos = prevArrayPos + arrayLength;

			arrayLength = readInt(array, pos);
			common = readInt(array, pos + count(arrayLength));

			actualCommon = Math.min(common, length);
			if (actualCommon <= currLen) currLen = actualCommon;
			else {
				copyFromBig(array, prevArrayPos, a, currLen + offset, actualCommon - currLen);
				currLen = actualCommon;
			}
		}

		if (currLen < length) copyFromBig(array, pos + count(arrayLength) + count(common), a, currLen + offset, Math.min(arrayLength, length - currLen));

		return arrayLength + common;
	}

	/** {@inheritDoc}
	 * @implSpec This implementation delegates to {@link #getArray(int)}. */
	@Override
	public KEY_TYPE[] get(final int index) {
		return getArray(index);
	}

	/** Returns an array stored in this front-coded list.
	 *
	 * @param index an index.
	 * @return the corresponding array stored in this front-coded list.
	 */
	public KEY_TYPE[] getArray(final int index) {
		ensureRestrictedIndex(index);
		final int length = length(index);
		final KEY_TYPE a[] = new KEY_TYPE[length];
		extract(index, a, 0, length);
		return a;
	}

	/** Stores in the given array elements from an array stored in this front-coded list.
	 *
	 * @param index an index.
	 * @param a the array that will store the result.
	 * @param offset an offset into {@code a} where elements will be store.
	 * @param length a maximum number of elements to store in {@code a}.
	 * @return if {@code a} can hold the extracted elements, the number of extracted elements;
	 * otherwise, the number of remaining elements with the sign changed.
	 */
	public int get(final int index, final KEY_TYPE[] a, final int offset, final int length) {
		ensureRestrictedIndex(index);
		ARRAYS.ensureOffsetLength(a, offset, length);

		final int arrayLength = extract(index, a, offset, length);
		if (length >= arrayLength) return arrayLength;
		return length - arrayLength;
	}

	/** Stores in the given array an array stored in this front-coded list.
	 *
	 * @param index an index.
	 * @param a the array that will store the content of the result (we assume that it can hold the result).
	 * @return if {@code a} can hold the extracted elements, the number of extracted elements;
	 * otherwise, the number of remaining elements with the sign changed.
	 */
	public int get(final int index, final KEY_TYPE[] a) {
		return get(index, a, 0, a.length);
	}

	@Override
	public int size() {
		return n;
	}

	@Override
	public ObjectListIterator<KEY_TYPE[]> listIterator(final int start) {
		ensureIndex(start);

		return new ObjectListIterator<KEY_TYPE[]>() {
				KEY_TYPE s[] = ARRAYS.EMPTY_ARRAY;
				int i = 0;
				long pos = 0;
				boolean inSync; // Whether the current value in a is the string just before the next to be produced.

				{
					if (start != 0) {
						if (start == n) i = start; // If we start at the end, we do nothing.
						else {
							pos = p[start / ratio];
							int j = start % ratio;
							i = start - j;
							while(j-- != 0) next();
						}
					}
				}

				@Override
				public boolean hasNext() {
					return i < n;
				}

				@Override
				public boolean hasPrevious() {
					return i > 0;
				}

				@Override
				public int previousIndex() {
					return i - 1;
				}

				@Override
				public int nextIndex() {
					return i;
				}

				@Override
				public KEY_TYPE[] next() {
					int length, common;

					if (! hasNext()) throw new NoSuchElementException();

					if (i % ratio == 0) {
						pos = p[i / ratio];
						length = readInt(array, pos);
						s = ARRAYS.ensureCapacity(s, length, 0);
						copyFromBig(array, pos + count(length), s, 0, length);
						pos += length + count(length);
						inSync = true;
					}
					else {
						if (inSync) {
							length = readInt(array, pos);
							common = readInt(array, pos + count(length));
							s = ARRAYS.ensureCapacity(s, length + common, common);
							copyFromBig(array, pos + count(length) + count (common), s, common, length);
							pos += count(length) + count(common) + length;
							length += common;
						}
						else {
							s = ARRAYS.ensureCapacity(s, length = length(i), 0);
							extract(i, s, 0, length);
						}
					}
					i++;
					return ARRAYS.copy(s, 0, length);
				}

				@Override
				public KEY_TYPE[] previous() {
					if (! hasPrevious()) throw new NoSuchElementException();
					inSync = false;
					return getArray(--i);
				}
			};
	}


	/** Returns a copy of this list.
	 *
	 *  @return a copy of this list.
	 */
	@Override
	public ARRAY_FRONT_CODED_LIST clone() {
		return this;
	}

	@Override
	public String toString() {
		final StringBuffer s = new StringBuffer();
		s.append("[");
		for(int i = 0; i < n; i++) {
			if (i != 0) s.append(", ");
			s.append(ARRAY_LIST.wrap(getArray(i)).toString());
		}
		s.append("]");
		return s.toString();
	}

	/** Computes the pointer array using the currently set ratio, number of elements and underlying array.
	 *
	 * @return the computed pointer array.
	 */

	protected long[] rebuildPointerArray() {
		final long[] p = new long[(n + ratio - 1) / ratio];
		final KEY_TYPE a[][] = array;
		int length, count;
		long pos = 0;

		for(int i = 0, j = 0, skip = ratio - 1; i < n; i++) {
			length = readInt(a, pos);
			count = count(length);
			if (++skip == ratio) {
				skip = 0;
				p[j++] = pos;
				pos += count + length;
			}
			else pos += count + count(readInt(a, pos + count)) + length;
		}

		return p;
	}


	private void readObject(java.io.ObjectInputStream s) throws java.io.IOException, ClassNotFoundException {
		s.defaultReadObject();

		// Rebuild pointer array
		p = rebuildPointerArray();
	}
}