Change gen cust hufftables to accept dictionary
Change-Id: I4eed03bdb91030b16b3ecfd8076adc890e4f59a2
Signed-off-by: Greg Tucker <greg.b.tucker@intel.com>
Greg Tucker
3 years ago
63 | 63 | #include <stdlib.h> |
64 | 64 | #include "igzip_lib.h" |
65 | 65 | |
66 | #include "huff_codes.h" | |
67 | #include "huffman.h" | |
68 | ||
66 | 69 | /*These max code lengths are limited by how the data is stored in |
67 | 70 | * hufftables.asm. The deflate standard max is 15.*/ |
68 | 71 | |
232 | 235 | fprintf(output_file, "const uint32_t zlib_trl_bytes = %d;\n", ZLIB_TRAILER_SIZE); |
233 | 236 | } |
234 | 237 | |
/**
 * @brief Returns the distance symbol value for a match distance.
 *
 * @param dist  Match distance; asserted to be in the range 1..32768.
 * @return The symbol index for dist, or ~0 when dist exceeds 32768 (only
 *         reachable when assertions are compiled out).
 */
static uint32_t convert_dist_to_dist_sym(uint32_t dist)
{
	uint32_t shift = 0;

	assert(dist <= 32768 && dist > 0);

	/* Out-of-range guard for builds where the assert above is disabled. */
	if (dist > 32768)
		return ~0;

	/* Distances 1..4 map straight to dist - 1; larger ones are bucketed by
	 * the bit position reported by bsr().
	 * NOTE(review): bsr() is declared outside this block -- confirm its
	 * indexing convention before changing this expression. */
	if (dist > 4)
		shift = bsr(dist - 1) - 2;

	return (shift * 2) + ((dist - 1) >> shift);
}
248 | ||
/**
 * @brief Returns the deflate symbol value for a repeat length.
 *
 * @param length  Repeat (match) length; asserted to be in the range 3..258.
 * @return The literal/length symbol (257..285) that encodes length.
 */
static uint32_t convert_length_to_len_sym(uint32_t length)
{
	uint32_t base;
	uint32_t shift;

	assert(length > 2 && length < 259);

	/* The maximum length has a dedicated symbol. */
	if (length >= 258)
		return 285;

	/* Based on tables on page 11 in RFC 1951: each group of symbols covers
	 * lengths in steps of 1 << shift, starting at base. */
	if (length < 11) {
		base = 257;
		shift = 0;
	} else if (length < 19) {
		base = 261;
		shift = 1;
	} else if (length < 35) {
		base = 265;
		shift = 2;
	} else if (length < 67) {
		base = 269;
		shift = 3;
	} else if (length < 131) {
		base = 273;
		shift = 4;
	} else {
		base = 277;
		shift = 5;
	}

	return base + ((length - 3) >> shift);
}
272 | ||
273 | void isal_update_histogram_dict(uint8_t * start_stream, int dict_length, int length, | |
274 | struct isal_huff_histogram *histogram) | |
275 | { | |
276 | uint32_t literal = 0, hash; | |
277 | uint16_t seen, *last_seen = histogram->hash_table; | |
278 | uint8_t *current, *end_stream, *next_hash, *end, *end_dict; | |
279 | uint32_t match_length; | |
280 | uint32_t dist; | |
281 | uint64_t *lit_len_histogram = histogram->lit_len_histogram; | |
282 | uint64_t *dist_histogram = histogram->dist_histogram; | |
283 | ||
284 | if (length <= 0) | |
285 | return; | |
286 | ||
287 | end_stream = start_stream + dict_length + length; | |
288 | end_dict = start_stream + dict_length; | |
289 | ||
290 | memset(last_seen, 0, sizeof(histogram->hash_table)); /* Initialize last_seen to be 0. */ | |
291 | ||
292 | for (current = start_stream; current < end_dict - 4; current++) { | |
293 | literal = load_u32(current); | |
294 | hash = compute_hash(literal) & LVL0_HASH_MASK; | |
295 | last_seen[hash] = (current - start_stream) & 0xFFFF; | |
296 | } | |
297 | ||
298 | for (current = start_stream + dict_length; current < end_stream - 3; current++) { | |
299 | literal = load_u32(current); | |
300 | hash = compute_hash(literal) & LVL0_HASH_MASK; | |
301 | seen = last_seen[hash]; | |
302 | last_seen[hash] = (current - start_stream) & 0xFFFF; | |
303 | dist = (current - start_stream - seen) & 0xFFFF; | |
304 | if (dist - 1 < D - 1) { | |
305 | assert(start_stream <= current - dist); | |
306 | match_length = | |
307 | compare258(current - dist, current, end_stream - current); | |
308 | if (match_length >= SHORTEST_MATCH) { | |
309 | next_hash = current; | |
310 | #ifdef ISAL_LIMIT_HASH_UPDATE | |
311 | end = next_hash + 3; | |
312 | #else | |
313 | end = next_hash + match_length; | |
314 | #endif | |
315 | if (end > end_stream - 3) | |
316 | end = end_stream - 3; | |
317 | next_hash++; | |
318 | for (; next_hash < end; next_hash++) { | |
319 | literal = load_u32(next_hash); | |
320 | hash = compute_hash(literal) & LVL0_HASH_MASK; | |
321 | last_seen[hash] = (next_hash - start_stream) & 0xFFFF; | |
322 | } | |
323 | ||
324 | dist_histogram[convert_dist_to_dist_sym(dist)] += 1; | |
325 | lit_len_histogram[convert_length_to_len_sym(match_length)] += | |
326 | 1; | |
327 | current += match_length - 1; | |
328 | continue; | |
329 | } | |
330 | } | |
331 | lit_len_histogram[literal & 0xFF] += 1; | |
332 | } | |
333 | ||
334 | for (; current < end_stream; current++) | |
335 | lit_len_histogram[*current] += 1; | |
336 | ||
337 | lit_len_histogram[256] += 1; | |
338 | return; | |
339 | } | |
340 | ||
235 | 341 | int main(int argc, char *argv[]) |
236 | 342 | { |
237 | 343 | long int file_length; |
344 | int argi = 1; | |
238 | 345 | uint8_t *stream = NULL; |
239 | 346 | struct isal_hufftables hufftables; |
240 | 347 | struct isal_huff_histogram histogram; |
241 | 348 | struct isal_zstream tmp_stream; |
242 | FILE *file; | |
349 | FILE *file = NULL; | |
350 | FILE *dict_file = NULL; | |
351 | long int dict_file_length = 0; | |
352 | uint8_t *dict_stream = NULL; | |
243 | 353 | |
244 | 354 | if (argc == 1) { |
245 | 355 | printf("Error, no input file.\n"); |
246 | 356 | return 1; |
247 | 357 | } |
248 | 358 | |
359 | if (argc > 3 && argv[1][0] == '-' && argv[1][1] == 'd') { | |
360 | dict_file = fopen(argv[2], "r"); | |
361 | ||
362 | fseek(dict_file, 0, SEEK_END); | |
363 | dict_file_length = ftell(dict_file); | |
364 | fseek(dict_file, 0, SEEK_SET); | |
365 | dict_file_length -= ftell(dict_file); | |
366 | dict_stream = malloc(dict_file_length); | |
367 | if (dict_stream == NULL) { | |
368 | printf("Failed to allocate memory to read in dictionary file\n"); | |
369 | fclose(dict_file); | |
370 | return 1; | |
371 | } | |
372 | if (fread(dict_stream, 1, dict_file_length, dict_file) != dict_file_length) { | |
373 | printf("Error occurred when reading dictionary file"); | |
374 | fclose(dict_file); | |
375 | free(dict_stream); | |
376 | return 1; | |
377 | } | |
378 | isal_update_histogram(dict_stream, dict_file_length, &histogram); | |
379 | ||
380 | printf("Read %ld bytes of dictionary file %s\n", dict_file_length, argv[2]); | |
381 | argi += 2; | |
382 | fclose(dict_file); | |
383 | free(dict_stream); | |
384 | } | |
385 | ||
249 | 386 | memset(&histogram, 0, sizeof(histogram)); /* Initialize histograms. */ |
250 | 387 | |
251 | while (argc > 1) { | |
252 | printf("Processing %s\n", argv[argc - 1]); | |
253 | file = fopen(argv[argc - 1], "r"); | |
388 | while (argi < argc) { | |
389 | printf("Processing %s\n", argv[argi]); | |
390 | file = fopen(argv[argi], "r"); | |
254 | 391 | if (file == NULL) { |
255 | 392 | printf("Error opening file\n"); |
256 | 393 | return 1; |
259 | 396 | file_length = ftell(file); |
260 | 397 | fseek(file, 0, SEEK_SET); |
261 | 398 | file_length -= ftell(file); |
262 | stream = malloc(file_length); | |
399 | stream = malloc(file_length + dict_file_length); | |
263 | 400 | if (stream == NULL) { |
264 | 401 | printf("Failed to allocate memory to read in file\n"); |
265 | 402 | fclose(file); |
266 | 403 | return 1; |
267 | 404 | } |
268 | if (fread(stream, 1, file_length, file) != file_length) { | |
405 | if (dict_file_length > 0) | |
406 | memcpy(stream, dict_stream, dict_file_length); | |
407 | ||
408 | if (fread(&stream[dict_file_length], 1, file_length, file) != file_length) { | |
269 | 409 | printf("Error occurred when reading file"); |
270 | 410 | fclose(file); |
271 | 411 | free(stream); |
274 | 414 | |
275 | 415 | /* Create a histogram of frequency of symbols found in stream to |
276 | 416 | * generate the huffman tree.*/ |
277 | isal_update_histogram(stream, file_length, &histogram); | |
417 | if (0 == dict_file_length) | |
418 | isal_update_histogram(stream, file_length, &histogram); | |
419 | else | |
420 | isal_update_histogram_dict(stream, dict_file_length, file_length, | |
421 | &histogram); | |
278 | 422 | |
279 | 423 | fclose(file); |
280 | 424 | free(stream); |
281 | argc--; | |
425 | argi++; | |
282 | 426 | } |
283 | 427 | |
284 | 428 | isal_create_hufftables(&hufftables, &histogram); |