Merge branch 'master' of gitlab.tw.trendnet.org:data-analytics-tlsh/tlsh
Vic Hargrave
8 years ago
16 | 16 | project(TLSH) |
17 | 17 | |
18 | 18 | set(VERSION_MAJOR 3) |
19 | set(VERSION_MINOR 2) | |
20 | set(VERSION_PATCH 1) | |
19 | set(VERSION_MINOR 4) | |
20 | set(VERSION_PATCH 0) | |
21 | 21 | |
22 | 22 | # TLSH uses only half the counting buckets. |
23 | 23 | # It can use all the buckets now. |
42 | 42 | # write a file with the VERSION information |
43 | 43 | file(REMOVE VERSION) |
44 | 44 | file(WRITE VERSION |
45 | "// This file is generated by cmake. Modify\n" | |
46 | "// CMakeLists.txt to change the VERSION numbers\n" | |
45 | 47 | "TLSH version: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH} ${TLSH_HASH}, ${TLSH_CHECKSUM}\n") |
46 | 48 | |
47 | 49 | file(REMOVE include/version.h) |
2 | 2 | ======================================= |
3 | 3 | |
4 | 4 | TLSH is a fuzzy matching library. Given a byte stream with a minimum length |
5 | of 512 bytes (and a minimum amount of randomness - see note in Python | |
5 | of 256 bytes (and a minimum amount of randomness - see note in Python | |
6 | 6 | extension below), TLSH generates a hash value which can be used for similarity |
7 | 7 | comparisons. Similar objects will have similar hash values which allows for |
8 | 8 | the detection of similar objects by comparing their hash values. Note that |
26 | 26 | hash values to determine similarity. Run it with no parameters for detailed usage. |
27 | 27 | |
28 | 28 | TLSH has been ported to Java, which can be found at https://github.com/triplecheck/TLSH |
29 | TLSH has been ported to Javascript, which can be found in the js_ext directory | |
29 | 30 | |
30 | 31 | ======================================= |
31 | 32 | Downloading TLSH |
75 | 76 | |
76 | 77 | import tlsh |
77 | 78 | tlsh.hash(data) |
78 | - note that the data must contain at least 512 bytes to generate a hash value and that | |
79 | - note that the data must contain at least 256 bytes to generate a hash value and that | |
79 | 80 | it must have a certain amount of randomness. |
80 | For example, tlsh.hash(str(os.urandom(512))), should always generate a hash. | |
81 | To get the hash value of a file, try tlsh.hash(open(file, 'rb').read()) | |
81 | For example, tlsh.hash(str(os.urandom(256))), should always generate a hash. | |
82 | To get the hash value of a file, try tlsh.hash(open(file, 'rb').read()) | |
82 | 83 | |
83 | 84 | tlsh.diff(h1, h2) |
84 | 85 | tlsh.diffxlen(h1, h2) |
169 | 170 | - Add -version flag to tlsh_unittest to get the version of the tlsh library. |
170 | 171 | 3.2.1 - Pickup fix to hash_py() in py_ext/tlshmodule.cpp |
171 | 172 | (commit da5370bcfdd40dd6a33c877ee87fe3866188cf2d) |
173 | 3.3.0 - Made the minimum data length = 256 for the C version | |
174 | 3.3.1 - Fixed bug introduced by commit 1a8f1c581c8b988ced683ff8e0a0f9c574058df4 | |
175 | which caused a different hash value to be generated if there were multiple | |
176 | calls to Tlsh::update as opposed to a single call to Tlsh::update. | |
177 | 3.4.0 - Add javascript implementation (see directory js_ext) - required for | |
178 | Blackhat presentation - https://www.blackhat.com/us-15/speakers/Sean-Park.html |
0 | str1 = 'This is a test for Lili Diao. This is a string. Hello Hello Hello OPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQ' | |
1 | str2 = 'This is a test for Jon Oliver. This is a string. Hello Hello Hello PQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHI' | |
2 | hash1 = 09F05A198CC69A5A4F0F9380A9EE93F2B927CF42089EA74276DC5F0BB2D34E68114448 | |
3 | hash2 = 301124198C869A5A4F0F9380A9AE92F2B9278F42089EA34272885F0FB2D34E6911444C | |
4 | difference (same strings) = 0 | |
5 | difference (with len) = 121 | |
6 | difference (without len) = 97 | |
7 | Testing Tlsh with multiple update calls | |
8 | hash3 = 09F05A198CC69A5A4F0F9380A9EE93F2B927CF42089EA74276DC5F0BB2D34E68114448 | |
9 | hash4 = 301124198C869A5A4F0F9380A9AE92F2B9278F42089EA34272885F0FB2D34E6911444C | |
10 | Testing Tlsh.fromTlshStr() | |
11 | Recreating tlsh3 from 09F05A198CC69A5A4F0F9380A9EE93F2B927CF42089EA74276DC5F0BB2D34E68114448 | |
12 | hash3 = 09F05A198CC69A5A4F0F9380A9EE93F2B927CF42089EA74276DC5F0BB2D34E68114448 | |
13 | Recreating tlsh4 from 301124198C869A5A4F0F9380A9AE92F2B9278F42089EA34272885F0FB2D34E6911444C | |
14 | hash4 = 301124198C869A5A4F0F9380A9AE92F2B9278F42089EA34272885F0FB2D34E6911444C | |
15 | difference (same strings) = 0 | |
16 | difference (with len) = 121 | |
17 | difference (without len) = 97 |
13 | 13 | if test ! -f ../bin/tlsh_unittest |
14 | 14 | then |
15 | 15 | echoerr "error: (127), you must compile tlsh_unittest" |
16 | popd > /dev/null | |
17 | exit 127 | |
18 | fi | |
19 | ||
20 | if test ! -f ../test/simple_unittest | |
21 | then | |
22 | echoerr "error: (127), you must compile ../test/simple_unittest" | |
16 | 23 | popd > /dev/null |
17 | 24 | exit 127 |
18 | 25 | fi |
177 | 184 | runit |
178 | 185 | runit "-xlen" |
179 | 186 | |
# Run the simple_unittest binary and compare its output (ignoring whitespace)
# with the expected transcript in exp/simple_unittest_EXP.
echo "Running simple_unittest"
../test/simple_unittest > "$TMP/simple_unittest.out"
if ! diff --ignore-all-space "$TMP/simple_unittest.out" exp/simple_unittest_EXP > /dev/null 2>&1; then
    echoerr "error: diff $TMP/simple_unittest.out exp/simple_unittest_EXP"
    popd > /dev/null
    # Exit statuses are limited to 0..255; "exit -1" is rejected by some shells,
    # so report failure with a plain non-zero status instead.
    exit 1
fi

echo "passed"
197 | ||
180 | 198 | popd > /dev/null |
0 | 0 | #!/bin/sh |
1 | 1 | |
2 | echo "rm -rf bin build lib Testing/tmp test/simple_unittest test/tlsh_version" | |
3 | rm -rf bin build lib Testing/tmp test/simple_unittest test/tlsh_version | |
2 | echo "rm -rf bin build lib Testing/tmp test/simple_unittest test/tlsh_version test/tlsh_unittest" | |
3 | rm -rf bin build lib Testing/tmp test/simple_unittest test/tlsh_version test/tlsh_unittest |
25 | 25 | #ifdef __cplusplus |
26 | 26 | |
27 | 27 | #include "tlsh_impl.h" |
28 | ||
29 | // changed the minimum data length to 256 for version 3.3 | |
30 | #define MIN_DATA_LENGTH 256 | |
28 | 31 | |
29 | 32 | class TLSH_API Tlsh{ |
30 | 33 |
69 | 69 | |
70 | 70 | private: |
71 | 71 | unsigned int *a_bucket; |
72 | unsigned char slide_window[SLIDING_WND_SIZE]; | |
72 | 73 | unsigned int data_len; |
73 | 74 | |
74 | 75 | struct lsh_bin_struct { |
0 | /**************************************************** | |
1 | * This file is generated by cmake. Modify the top | |
2 | * level CMakeLists.txt to change the VERSION numbers | |
3 | ****************************************************/ | |
4 | ||
5 | #define VERSION_MAJOR 3 | |
6 | #define VERSION_MINOR 2 | |
7 | #define VERSION_PATCH 1 | |
8 | #define TLSH_HASH "compact hash" | |
9 | #define TLSH_CHECKSUM "1 byte checksum" |
0 | <!DOCTYPE html> | |
1 | <html> | |
2 | <head> | |
3 | </head> | |
4 | <body> | |
5 | ||
6 | <p>TLSH simpletest demo:</p> | |
7 | <p id="tlsh_simple_test"></p> | |
8 | ||
9 | <script src="tlsh.js"> </script> | |
10 | ||
11 | ||
12 | <script type="text/javascript"> | |
13 | ||
14 | /* | |
15 | * Tester for tlsh.js. | |
16 | * Output to correspond to output from C++ executable, simple_test | |
17 | */ | |
18 | ||
19 | var debug = false; | |
20 | document.getElementById("tlsh_simple_test").innerHTML = "Output to correspond to output from the C++ executable, simple_test"; | |
21 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
22 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
23 | ||
24 | /*************************************************************************************************** | |
25 | * Create str1 from 2 substrings | |
26 | */ | |
27 | var str_1_a = "This is a test for Lili Diao. This is a string. Hello Hello Hello "; | |
28 | var str_1_b = "OPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQ"; | |
29 | var str_1 = str_1_a + str_1_b; | |
30 | debug && console.log("str1 = '"+str_1+"'"); | |
31 | document.getElementById("tlsh_simple_test").innerHTML += "str1 = '" + str_1 + "'"; | |
32 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
33 | ||
34 | /*************************************************************************************************** | |
35 | * Create Tlsh from str1 | |
36 | */ | |
37 | var tlsh1 = new Tlsh(); | |
38 | tlsh1.update(str_1, str_1.length+1); // str_1 includes NULL byte at end in simple_test.cpp, so add 1 to the length | |
39 | tlsh1.finale(); | |
40 | ||
41 | /*************************************************************************************************** | |
42 | * Create str2 from 2 substrings | |
43 | */ | |
44 | var str_2_a = "This is a test for Jon Oliver. This is a string. Hello Hello Hello "; | |
45 | var str_2_b = "PQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHI"; | |
46 | var str_2 = str_2_a + str_2_b; | |
47 | debug && console.log("str2 = '"+str_2+"'"); | |
48 | document.getElementById("tlsh_simple_test").innerHTML += "str2 = '" + str_2 + "'"; | |
49 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
50 | ||
51 | /*************************************************************************************************** | |
52 | * Create Tlsh from str2 | |
53 | */ | |
54 | var tlsh2 = new Tlsh(); | |
55 | tlsh2.update(str_2, str_2.length+1); // str_2 includes NULL byte at end in simple_test.cpp, so add 1 to the length | |
56 | tlsh2.finale(); | |
57 | var hash2 = tlsh2.hash(); | |
58 | ||
59 | /*************************************************************************************************** | |
60 | * Get differences (with and without length) and output. | |
61 | */ | |
62 | var hash1 = tlsh1.hash(); | |
63 | var diff_same_str = tlsh1.totalDiff(tlsh1); | |
64 | var diff_with_len = tlsh1.totalDiff(tlsh2); | |
65 | var diff_without_len = tlsh1.totalDiff(tlsh2, false); | |
66 | ||
67 | debug && console.log("hash1 = "+hash1); | |
68 | debug && console.log("hash2 = "+hash2); | |
69 | debug && console.log("difference (same strings) = " + diff_same_str); | |
70 | debug && console.log("difference (with len) = " + diff_with_len); | |
71 | debug && console.log("difference (without len) = " + diff_without_len); | |
72 | ||
73 | document.getElementById("tlsh_simple_test").innerHTML += "hash1 = " + hash1; | |
74 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
75 | document.getElementById("tlsh_simple_test").innerHTML += "hash2 = " + hash2; | |
76 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
77 | document.getElementById("tlsh_simple_test").innerHTML += "difference (same strings) = " + diff_same_str; | |
78 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
79 | document.getElementById("tlsh_simple_test").innerHTML += "difference (with len) = " + diff_with_len; | |
80 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
81 | document.getElementById("tlsh_simple_test").innerHTML += "difference (without len) = " + diff_without_len; | |
82 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
83 | ||
84 | /*************************************************************************************************** | |
85 | * Create Tlsh from substrings of str1 and verify that the hash values are the same | |
86 | */ | |
// Hash str1 again, this time feeding it through two update() calls, and
// require the result to equal hash1 (the single-call hash of the same data).
debug && console.log("Testing Tlsh with multiple update calls");
document.getElementById("tlsh_simple_test").innerHTML += "Testing Tlsh with multiple update calls";
document.getElementById("tlsh_simple_test").innerHTML += "<br>";
var tlsh3 = new Tlsh();
tlsh3.update(str_1_a);
tlsh3.update(str_1_b, str_1_b.length+1);    // +1 appends the trailing NUL byte, as for tlsh1
tlsh3.finale();
var hash3 = tlsh3.hash();
if (hash1 != hash3) {
    document.getElementById("tlsh_simple_test").innerHTML = "ERROR: tlsh1 != tlsh3";
    throw("ERROR: tlsh1 != tlsh3");
}
// Log hash3 itself; hash4 has not been computed yet at this point.
debug && console.log("hash3 = " + hash3);
document.getElementById("tlsh_simple_test").innerHTML += "hash3 = " + hash3;
document.getElementById("tlsh_simple_test").innerHTML += "<br>";
102 | ||
103 | /*************************************************************************************************** | |
104 | * Create Tlsh from substrings of str2 and verify that the hash values are the same | |
105 | */ | |
106 | var tlsh4 = new Tlsh(); | |
107 | tlsh4.update(str_2_a); | |
108 | tlsh4.finale(str_2_b, str_2_b.length + 1); | |
109 | var hash4 = tlsh4.hash(); | |
110 | if (hash2 != hash4) { | |
111 | document.getElementById("tlsh_simple_test").innerHTML = "ERROR: tlsh2 != tlsh4"; | |
112 | throw("ERROR: tlsh2 != tlsh4"); | |
113 | } | |
114 | debug && console.log("hash4 = " + hash4); | |
115 | document.getElementById("tlsh_simple_test").innerHTML += "hash4 = " + hash4; | |
116 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
117 | ||
118 | /*************************************************************************************************** | |
119 | * Create Tlsh fromTlshStr() method of hash1 and verify that the hash values are the same | |
120 | */ | |
121 | debug && console.log("Testing Tlsh.fromTlshStr()"); | |
122 | debug && console.log("Recreating tlsh3 from "+hash1); | |
123 | document.getElementById("tlsh_simple_test").innerHTML += "Testing Tlsh.fromTlshStr()"; | |
124 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
125 | document.getElementById("tlsh_simple_test").innerHTML += "Recreating tlsh3 from " + hash1; | |
126 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
127 | ||
128 | tlsh3.reset(); | |
129 | tlsh3.fromTlshStr(hash1); | |
130 | hash3 = tlsh3.hash(); | |
131 | if (hash1 != hash3) { | |
132 | document.getElementById("tlsh_simple_test").innerHTML = "ERROR: hash1 != tlsh3.hash()"; | |
133 | throw("ERROR: hash1 != tlsh3.hash()"); | |
134 | } | |
135 | ||
136 | debug && console.log("hash3 = " + hash3); | |
137 | document.getElementById("tlsh_simple_test").innerHTML += "hash3 = " + hash3; | |
138 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
139 | ||
// Rebuild tlsh4 from the hash2 string, re-read its hash, and require the
// round trip to reproduce hash2 (same check as is done for tlsh3 / hash1).
debug && console.log("Recreating tlsh4 from "+hash2);
document.getElementById("tlsh_simple_test").innerHTML += "Recreating tlsh4 from " + hash2;
document.getElementById("tlsh_simple_test").innerHTML += "<br>";

tlsh4.reset();
tlsh4.fromTlshStr(hash2);
hash4 = tlsh4.hash();
if (hash2 != hash4) {
    document.getElementById("tlsh_simple_test").innerHTML = "ERROR: hash2 != tlsh4.hash()";
    throw("ERROR: hash2 != tlsh4.hash()");
}
debug && console.log("hash4 = " + hash4);
document.getElementById("tlsh_simple_test").innerHTML += "hash4 = " + hash4;
document.getElementById("tlsh_simple_test").innerHTML += "<br>";
149 | ||
150 | diff_with_len = tlsh3.totalDiff(tlsh4); | |
151 | diff_without_len = tlsh3.totalDiff(tlsh4, false); | |
152 | diff_same_str = tlsh3.totalDiff(tlsh3); | |
153 | debug && console.log("difference (same strings) = " + diff_same_str); | |
154 | debug && console.log("difference (with len) = " + diff_with_len); | |
155 | debug && console.log("difference (without len) = " + diff_without_len); | |
156 | ||
157 | document.getElementById("tlsh_simple_test").innerHTML += "difference (same strings) = " + diff_same_str; | |
158 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
159 | document.getElementById("tlsh_simple_test").innerHTML += "difference (with len) = " + diff_with_len; | |
160 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
161 | document.getElementById("tlsh_simple_test").innerHTML += "difference (without len) = " + diff_without_len; | |
162 | document.getElementById("tlsh_simple_test").innerHTML += "<br>"; | |
163 | ||
164 | </script> | |
165 | </body> | |
166 | </html> | |
167 |
0 | /* | |
1 | * Copyright 2013 Trend Micro Incorporated | |
2 | * | |
3 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
4 | * you may not use this file except in compliance with the License. | |
5 | * You may obtain a copy of the License at | |
6 | * | |
7 | * http://www.apache.org/licenses/LICENSE-2.0 | |
8 | * | |
9 | * Unless required by applicable law or agreed to in writing, software | |
10 | * distributed under the License is distributed on an "AS IS" BASIS, | |
11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
12 | * See the License for the specific language governing permissions and | |
13 | * limitations under the License. | |
14 | */ | |
15 | ||
16 | /* | |
17 | * Port of C++ implementation tlsh to javascript. | |
18 | * | |
19 | * Construct Tlsh object with methods: | |
20 | * update | |
21 | * finale | |
22 | * fromTlshStr | |
23 | * reset | |
24 | * hash | |
25 | * totalDiff | |
26 | * | |
27 | * See tlsh.html for example use. | |
28 | */ | |
29 | ||
30 | var debug = false; | |
31 | /////////////////////////////////////////////////////////////////////////////////// | |
32 | // From tlsh_util.cpp | |
33 | var v_table = new Uint8Array([ | |
34 | 1, 87, 49, 12, 176, 178, 102, 166, 121, 193, 6, 84, 249, 230, 44, 163, | |
35 | 14, 197, 213, 181, 161, 85, 218, 80, 64, 239, 24, 226, 236, 142, 38, 200, | |
36 | 110, 177, 104, 103, 141, 253, 255, 50, 77, 101, 81, 18, 45, 96, 31, 222, | |
37 | 25, 107, 190, 70, 86, 237, 240, 34, 72, 242, 20, 214, 244, 227, 149, 235, | |
38 | 97, 234, 57, 22, 60, 250, 82, 175, 208, 5, 127, 199, 111, 62, 135, 248, | |
39 | 174, 169, 211, 58, 66, 154, 106, 195, 245, 171, 17, 187, 182, 179, 0, 243, | |
40 | 132, 56, 148, 75, 128, 133, 158, 100, 130, 126, 91, 13, 153, 246, 216, 219, | |
41 | 119, 68, 223, 78, 83, 88, 201, 99, 122, 11, 92, 32, 136, 114, 52, 10, | |
42 | 138, 30, 48, 183, 156, 35, 61, 26, 143, 74, 251, 94, 129, 162, 63, 152, | |
43 | 170, 7, 115, 167, 241, 206, 3, 150, 55, 59, 151, 220, 90, 53, 23, 131, | |
44 | 125, 173, 15, 238, 79, 95, 89, 16, 105, 137, 225, 224, 217, 160, 37, 123, | |
45 | 118, 73, 2, 157, 46, 116, 9, 145, 134, 228, 207, 212, 202, 215, 69, 229, | |
46 | 27, 188, 67, 124, 168, 252, 42, 4, 29, 108, 21, 247, 19, 205, 39, 203, | |
47 | 233, 40, 186, 147, 198, 192, 155, 33, 164, 191, 98, 204, 165, 180, 117, 76, | |
48 | 140, 36, 210, 172, 41, 54, 159, 8, 185, 232, 113, 196, 231, 47, 146, 120, | |
49 | 51, 65, 28, 144, 254, 221, 93, 189, 194, 139, 112, 43, 71, 109, 184, 209]); | |
50 | ||
/*
 * Combine a salt and three values into a single v_table entry.
 *
 * Four chained lookups are performed, starting from 0: each lookup indexes
 * v_table with the previous result XOR-ed with the next input
 * (salt, then i, then j, then k).  The last entry read is returned.
 */
function b_mapping(salt, i, j, k)
{
    var after_salt = v_table[0 ^ salt];
    var after_i = v_table[after_salt ^ i];
    var after_j = v_table[after_i ^ j];
    return v_table[after_j ^ k];
}
61 | ||
62 | var LOG_1_5 = 0.4054651; | |
63 | var LOG_1_3 = 0.26236426; | |
64 | var LOG_1_1 = 0.095310180; | |
65 | ||
/*
 * Map a data length onto a one-byte value using a piecewise logarithmic
 * scale: one formula for len <= 656, another for len <= 3199, and a third
 * for anything larger.  The floored result is masked to its low 8 bits.
 */
function l_capturing(len) {
    var ln_len = Math.log(len);

    if (len <= 656) {
        return Math.floor(ln_len / LOG_1_5) & 0xFF;
    }
    if (len <= 3199) {
        return Math.floor(ln_len / LOG_1_3 - 8.72777) & 0xFF;
    }
    return Math.floor(ln_len / LOG_1_1 - 62.5472) & 0xFF;
}
78 | ||
/*
 * Exchange the two 4-bit halves of the low byte of i: bits 4..7 move down to
 * bits 0..3 and bits 0..3 move up to bits 4..7.  Higher bits are discarded.
 */
function swap_byte( i )
{
    var high_moved_down = (i >> 4) & 0x0F;
    var low_moved_up = (i << 4) & 0xF0;
    return high_moved_down | low_moved_up;
}
86 | ||
/*
 * Render the first len entries of data as upper-case hexadecimal, two digits
 * per entry (values below 16 get a leading "0").  Follows the to_hex of the
 * TLSH.java implementation.
 *
 * data - array-like of byte values
 * len  - number of leading entries to convert
 *
 * Returns a primitive string; for len == 0 that is "" (the accumulator used
 * to start as a String wrapper object, which is what a zero-length call
 * returned).
 */
function to_hex( data, len )
{
    var s = "";
    for (var i = 0; i < len; i++) {
        debug && console.log("to_hex: "+data[i]);
        var digits = data[i].toString(16).toUpperCase();
        s += (data[i] < 16 ? "0" : "") + digits;
    }

    return s;
}
101 | ||
/*
 * Decode a hexadecimal string into a Uint8Array, one byte per pair of
 * characters.  Follows the from_hex of the TLSH.java implementation.
 */
function from_hex( str )
{
    var bytes = new Uint8Array(str.length / 2);   // unsigned char array
    for (var pos = 0, slot = 0; pos < str.length; pos += 2, slot++) {
        bytes[slot] = parseInt(str.slice(pos, pos + 2), 16);
    }
    return bytes;
}
111 | ||
/*
 * Distance between x and y on a ring of size R: the smaller of the direct
 * difference and the difference taken the other way round through R.
 */
function mod_diff(x, y, R)
{
    var y_is_larger = y > x;
    var direct = y_is_larger ? y - x : x - y;
    var wrapped = y_is_larger ? x + R - y : y + R - x;

    if (direct > wrapped) {
        return wrapped;
    }
    return direct;
}
125 | ||
126 | // Use generateTable() from TLSH.java implementation | |
/*
 * Build the 256 x 256 lookup table of byte-to-byte differences.
 *
 * Each byte is treated as four base-4 digits (2-bit pairs).  For every digit
 * position the absolute difference of the two digits is added to the total,
 * except that a difference of 3 is counted as 6.  Returns an Array of 256
 * Uint8Array rows, indexed as table[i][j].
 */
function generateTable()
{
    var SIZE = 256;
    var table = [];

    for (var row = 0; row < SIZE; row++) {
        var line = new Uint8Array(SIZE);
        for (var col = 0; col < SIZE; col++) {
            var a = row;
            var b = col;
            var total = 0;
            // walk the four 2-bit digits from least to most significant
            for (var digit = 0; digit < 4; digit++) {
                var gap = Math.abs(a % 4 - b % 4);
                total += (gap == 3 ? 6 : gap);
                a = Math.floor(a / 4);
                b = Math.floor(b / 4);
            }
            line[col] = total;
        }
        table.push(line);
    }
    return table;
}
157 | ||
158 | var bit_pairs_diff_table = generateTable(); | |
159 | ||
/*
 * Sum the bit_pairs_diff_table entries for the first len positions of the
 * byte arrays x and y, i.e. the total of table[x[n]][y[n]] for n < len.
 */
function h_distance( len, x, y)
{
    var total = 0;
    var pos = 0;
    while (pos < len) {
        debug && console.log("bit_pairs_diff_table["+x[pos]+"]["+y[pos]+"]="+bit_pairs_diff_table[x[pos]][y[pos]]);
        total += bit_pairs_diff_table[ x[pos] ][ y[pos] ];
        pos++;
    }
    debug && console.log("h_distance returning "+total);
    return total;
}
170 | ||
171 | /////////////////////////////////////////////////////////////////////////////////// | |
172 | // from C #defines in tlsh_impl.h and tlsh_impl.cpp | |
// Sliding window size; the ring used by update() has the same size.
var SLIDING_WND_SIZE = 5;
var RNG_SIZE = SLIDING_WND_SIZE;

// Wrap an index into the ring.  RNG_SIZE is added before the modulo so that
// small negative offsets (down to -RNG_SIZE) still give a non-negative result.
function RNG_IDX(i)
{
    var shifted = i + RNG_SIZE;
    return shifted % RNG_SIZE;
}

var TLSH_CHECKSUM_LEN = 1;
var BUCKETS = 256;
var EFF_BUCKETS = 128;
var CODE_SIZE = 32;          // 128 * 2 bits = 32 bytes
var TLSH_STRING_LEN = 70;    // (2 + 1 + 32) bytes, two hexadecimal chars per byte = 70 chars
var RANGE_LVALUE = 256;
var RANGE_QRATIO = 16;
183 | ||
/*
 * Exchange the entries at positions x and y of buf.bucket_copy, in place.
 */
function SWAP_UINT(buf, x, y)
{
    var cells = buf.bucket_copy;
    var held = cells[y];
    cells[y] = cells[x];
    cells[x] = held;
}
190 | ||
191 | /////////////////////////////////////////////////////////////////////////////////// | |
192 | // TLSH member and non-member functions - from tlsh_impl.cpp | |
193 | ||
/*
 * Partition buf.bucket_copy[left..right] in place around the value found at
 * the middle index and return the index where that value ends up.
 *
 * On return, every entry of the range before the returned index is smaller
 * than the pivot value.  A one-element range returns left unchanged; a
 * two-element range is simply put in ascending order and left is returned.
 */
function partition(buf, left, right)
{
    var cells = buf.bucket_copy;

    if (left == right) {
        return left;
    }
    if (left + 1 == right) {
        if (cells[left] > cells[right]) {
            SWAP_UINT( buf, left, right );
        }
        return left;
    }

    // Park the pivot value at the right end while the rest is scanned.
    var middle = (left + right) >> 1;
    var pivot_value = cells[middle];
    cells[middle] = cells[right];
    cells[right] = pivot_value;

    // boundary = first slot not yet known to hold a value below the pivot
    var boundary = left;
    var scan = left;
    while (scan < right) {
        if (cells[scan] < pivot_value) {
            SWAP_UINT( buf, boundary, scan );
            boundary++;
        }
        scan++;
    }

    // Drop the pivot value into its final position.
    cells[right] = cells[boundary];
    cells[boundary] = pivot_value;

    return boundary;
}
225 | ||
/*
 * Find the three quartile values of the first EFF_BUCKETS entries of
 * tlsh.a_bucket and store them in quartiles.q1, quartiles.q2, quartiles.q3.
 *
 * The buckets are copied into a scratch array which is repeatedly narrowed
 * with partition() until a pivot lands on the wanted position.  Pivot
 * positions seen while searching for q2 are remembered and reused as
 * starting bounds for the q1 and q3 searches.  tlsh.a_bucket itself is not
 * modified.
 */
function find_quartile(tlsh, quartiles)
{
    var LAST = EFF_BUCKETS-1;
    var POS_Q1 = EFF_BUCKETS/4-1;
    var POS_Q2 = EFF_BUCKETS/2-1;
    var POS_Q3 = EFF_BUCKETS-EFF_BUCKETS/4-1;

    var work = { bucket_copy: new Uint32Array(EFF_BUCKETS) };
    for (var n = 0; n <= LAST; n++) {
        work.bucket_copy[n] = tlsh.a_bucket[n];
    }

    // pivot positions that fell below / above POS_Q2 during the q2 search
    var pivots_below = new Uint32Array(EFF_BUCKETS);
    var pivots_above = new Uint32Array(EFF_BUCKETS);
    var below_count = 0;
    var above_count = 0;

    var lo = 0;
    var hi = LAST;
    var at;

    // --- q2: narrow [lo, hi] until a pivot lands exactly on POS_Q2 ---
    for (;;) {
        at = partition( work, lo, hi );
        if (at > POS_Q2) {
            hi = at - 1;
            pivots_above[above_count++] = at;
        } else if (at < POS_Q2) {
            lo = at + 1;
            pivots_below[below_count++] = at;
        } else {
            quartiles.q2 = work.bucket_copy[POS_Q2];
            break;
        }
    }

    // terminate both lists with the positions adjacent to POS_Q2
    pivots_below[below_count] = POS_Q2-1;
    pivots_above[above_count] = POS_Q2+1;

    // --- q1: walk the recorded lower pivots to bracket POS_Q1 ---
    lo = 0;
    for (n = 0; n <= below_count; n++) {
        hi = pivots_below[n];
        if (hi < POS_Q1) {
            lo = hi;
            continue;
        }
        if (hi > POS_Q1) {
            for (;;) {
                at = partition( work, lo, hi );
                if (at > POS_Q1) {
                    hi = at-1;
                } else if (at < POS_Q1) {
                    lo = at+1;
                } else {
                    break;
                }
            }
        }
        quartiles.q1 = work.bucket_copy[POS_Q1];
        break;
    }

    // --- q3: walk the recorded upper pivots to bracket POS_Q3 ---
    hi = LAST;
    for (n = 0; n <= above_count; n++) {
        lo = pivots_above[n];
        if (lo > POS_Q3) {
            hi = lo;
            continue;
        }
        if (lo < POS_Q3) {
            for (;;) {
                at = partition( work, lo, hi );
                if (at > POS_Q3) {
                    hi = at-1;
                } else if (at < POS_Q3) {
                    lo = at+1;
                } else {
                    break;
                }
            }
        }
        quartiles.q3 = work.bucket_copy[POS_Q3];
        break;
    }
}
308 | ||
309 | /////////////////////////////////////////////////////////////////////////////////// | |
310 | // Definition of tlsh object | |
/*
 * Constructor: create an empty hash state.
 *
 * Allocates zero-filled typed arrays for the checksum, the sliding window,
 * the buckets and the code bytes, and clears the scalar fields.
 */
var Tlsh = function ()
{
    // working buffers
    this.checksum     = new Uint8Array(TLSH_CHECKSUM_LEN);   // unsigned char array
    this.slide_window = new Uint8Array(SLIDING_WND_SIZE);
    this.a_bucket     = new Uint32Array(BUCKETS);            // unsigned int array

    // number of bytes fed in so far
    this.data_len = 0;

    // parts of the finished hash
    this.tmp_code = new Uint8Array(CODE_SIZE);
    this.Lvalue   = 0;
    this.Q        = 0;

    // hash string and the flag recording whether it is valid
    this.lsh_code       = new String();
    this.lsh_code_valid = false;
};
323 | ||
324 | // Use get/setQLo() and get/setQHi() from TLSH.java implementation | |
// Return the low 4 bits of Q.
function getQLo(Q)
{
    return Q & 0x0F;
}

// Return bits 4..7 of Q, shifted down into the range 0..15.
function getQHi(Q)
{
    return (Q >> 4) & 0x0F;
}

// Return Q's bits 4..7 combined with the low 4 bits of x as the new low half.
function setQLo(Q, x)
{
    var kept_high = Q & 0xF0;
    var new_low = x & 0x0F;
    return kept_high | new_low;
}

// Return Q's low 4 bits combined with the low 4 bits of x as the new high half.
function setQHi(Q, x)
{
    var kept_low = Q & 0x0F;
    var new_high = (x & 0x0F) << 4;
    return kept_low | new_high;
}
344 | ||
// Feed more input into the hash state.
//
// str    - input string; every character code must be <= 255.
// length - optional number of bytes to consume, defaults to str.length.
//          Positions past the end of str read as NaN from charCodeAt and are
//          stored as 0 bytes, so passing str.length+1 appends a trailing NUL
//          byte (tlsh.html does this to match the strings in simple_test.cpp).
//
// All input is validated before any state is touched: on a character code
// above 255, or a length that does not match the number of bytes collected,
// an alert() is shown and the call returns with the object unchanged.
Tlsh.prototype.update = function (str, length)
{
    length = typeof length !== 'undefined' ? length : str.length;

    // Collect the input as byte values.
    var data = [];
    for(var i = 0; i < length; i++) {
        var code = str.charCodeAt(i);
        if (code > 255) {
            alert("Unexpected " + str[i] + " has value " + code + " which is too large");
            return;
        }
        // charCodeAt returns 0..65535 (or NaN past the end of the string);
        // only single-byte values get this far, and NaN & 0xff is 0.
        data.push(code & 0xff);
    }

    if (length != data.length)
    {
        alert("Unexpected string length:" + length + " is not equal to value unsigned char length: " + data.length);
        return;
    }

    // j is the ring slot for the next byte; fed_len counts all bytes seen so far.
    var j = this.data_len % RNG_SIZE;
    var fed_len = this.data_len;

    for( var i=0; i<length; i++, fed_len++, j=RNG_IDX(j+1) ) {
        this.slide_window[j] = data[i];
        debug && console.log("slide_window["+j+"]="+this.slide_window[j]);

        if ( fed_len >= 4 ) {
            //only calculate when input >= 5 bytes
            var c0 = this.slide_window[j];              // newest byte
            var c1 = this.slide_window[RNG_IDX(j-1)];
            var c2 = this.slide_window[RNG_IDX(j-2)];
            var c3 = this.slide_window[RNG_IDX(j-3)];
            var c4 = this.slide_window[RNG_IDX(j-4)];   // oldest byte in the window

            for (var k = 0; k < TLSH_CHECKSUM_LEN; k++) {
                if (k == 0) {
                    this.checksum[k] = b_mapping(0, c0, c1, this.checksum[k]);
                    debug && console.log("tlsh.checksum["+k+"]="+this.checksum[k]);
                }
                else {
                    // use calculated 1 byte checksums to expand the total checksum to 3 bytes
                    this.checksum[k] = b_mapping(this.checksum[k-1], c0, c1, this.checksum[k]);
                }
            }

            // Count six salted triplets taken from the window.  (The first
            // index used to be computed three times by identical calls; once
            // is enough.)
            var r;
            r = b_mapping(2, c0, c1, c2);
            this.a_bucket[r]++;
            r = b_mapping(3, c0, c1, c3);
            this.a_bucket[r]++;
            r = b_mapping(5, c0, c2, c3);
            this.a_bucket[r]++;
            r = b_mapping(7, c0, c2, c4);
            this.a_bucket[r]++;
            r = b_mapping(11, c0, c1, c4);
            this.a_bucket[r]++;
            r = b_mapping(13, c0, c3, c4);
            this.a_bucket[r]++;
        }
    }
    // data.length equals length here but is always a number, so data_len
    // stays numeric even when length was passed as a numeric string.
    this.data_len += data.length;
}
417 | ||
418 | // final is a reserved word | |
/**
 * Finish the digest: optionally absorb one last chunk, then derive the body
 * code, the length value and the quartile ratios from the bucket counts.
 *
 * On success this.tmp_code, this.Lvalue and this.Q are filled in and
 * this.lsh_code_valid is set to true. When the input is shorter than 256
 * bytes, or too few buckets are non-zero, an alert is raised and the method
 * returns early WITHOUT marking the hash valid, so hash() keeps reporting an
 * error instead of emitting a digest for rejected input.
 *
 * @param {string} [str]    optional final chunk, forwarded to update()
 * @param {number} [length] length forwarded to update() together with str
 */
Tlsh.prototype.finale = function (str, length)
{
    if (typeof str !== 'undefined') {
        this.update(str, length);
    }

    // incoming data must be at least 256 bytes
    if (this.data_len < 256) {
        alert("ERROR: length too small - " + this.data_len); // + ")");
        return;
    }

    var quartiles = new Object();
    quartiles.q1 = 0;
    quartiles.q2 = 0;
    quartiles.q3 = 0;
    find_quartile(this, quartiles);

    // more than half of the 4*CODE_SIZE buckets in use must be non-zero
    var nonzero = 0;
    for (var i = 0; i < CODE_SIZE; i++) {
        for (var j = 0; j < 4; j++) {
            if (this.a_bucket[4*i + j] > 0) {
                nonzero++;
            }
        }
    }
    if (nonzero <= 4*CODE_SIZE/2) {
        alert("ERROR: not enought variation in input - " + nonzero + " < " + 4*CODE_SIZE/2);
        return;
    }

    // Each body byte packs four buckets, two bits per bucket: the 2-bit value
    // records which quartile boundary (q1/q2/q3) the bucket count exceeds.
    for (var i = 0; i < CODE_SIZE; i++) {
        var h = 0;
        for (var j = 0; j < 4; j++) {
            var k = this.a_bucket[4*i + j];
            if (quartiles.q3 < k) {
                h += 3 << (j*2);
            } else if (quartiles.q2 < k) {
                h += 2 << (j*2);
            } else if (quartiles.q1 < k) {
                h += 1 << (j*2);
            }
        }
        this.tmp_code[i] = h;
    }

    this.Lvalue = l_capturing(this.data_len);
    // JavaScript division yields a float; floor it so the ratio is an integer
    // quotient before it is reduced modulo 16.
    this.Q = setQLo(this.Q, Math.floor((quartiles.q1*100)/quartiles.q3) % 16);
    this.Q = setQHi(this.Q, Math.floor((quartiles.q2*100)/quartiles.q3) % 16);
    this.lsh_code_valid = true;
};
469 | ||
/**
 * Render the computed digest as a hex string.
 *
 * The output is the hex encoding of, in order: the checksum byte(s), the
 * Lvalue byte, the Q byte (each passed through swap_byte), followed by the
 * body code bytes in reverse order. The string is also cached in
 * this.lsh_code.
 *
 * @returns {string} the hex digest, or "ERROR IN PROCESSING" when no valid
 *                   hash has been computed or loaded
 */
Tlsh.prototype.hash = function ()
{
    if (this.lsh_code_valid == false) {
        return "ERROR IN PROCESSING";
    }

    var tmp = new Object();
    tmp.checksum = new Uint8Array(TLSH_CHECKSUM_LEN);
    tmp.Lvalue = 0;
    tmp.Q = 0;
    tmp.tmp_code = new Uint8Array(CODE_SIZE);

    for (var k = 0; k < TLSH_CHECKSUM_LEN; k++) {
        tmp.checksum[k] = swap_byte( this.checksum[k] );
        debug && console.log("After swap_byte for checksum: tmp.checksum:"+tmp.checksum[k]+", tlsh.checksum:"+this.checksum[k]);
    }
    tmp.Lvalue = swap_byte( this.Lvalue );
    tmp.Q = swap_byte( this.Q );
    debug && console.log("After swap_byte for Q: tmp.Q:"+tmp.Q+", tlsh.Q:"+this.Q);
    // body code is emitted back to front
    for (var i = 0; i < CODE_SIZE; i++) {
        tmp.tmp_code[i] = this.tmp_code[CODE_SIZE-1-i];
        debug && console.log("tmp.tmp_code["+i+"]:"+tmp.tmp_code[i]);
    }

    this.lsh_code = to_hex(tmp.checksum, TLSH_CHECKSUM_LEN);

    // Declared locally: assigning without `var` would create an implicit
    // global (and throw a ReferenceError in strict mode).
    var tmpArray = new Uint8Array(1);
    tmpArray[0] = tmp.Lvalue;
    this.lsh_code = this.lsh_code.concat(to_hex(tmpArray, 1));

    tmpArray[0] = tmp.Q;
    this.lsh_code = this.lsh_code.concat(to_hex(tmpArray, 1));
    this.lsh_code = this.lsh_code.concat(to_hex(tmp.tmp_code, CODE_SIZE));
    return this.lsh_code;
};
505 | ||
/**
 * Return the object to its pristine state: zeroed checksum, sliding window,
 * buckets and body code, no data consumed, and no valid hash available.
 */
Tlsh.prototype.reset = function ()
{
    this.checksum = new Uint8Array(TLSH_CHECKSUM_LEN);
    this.slide_window = new Uint8Array(SLIDING_WND_SIZE);
    this.a_bucket = new Uint32Array(BUCKETS);
    this.data_len = 0;
    this.tmp_code = new Uint8Array(CODE_SIZE);
    this.Lvalue = 0;
    this.Q = 0;
    // Primitive empty string rather than a String wrapper object: a wrapper
    // is always truthy and has typeof "object".
    this.lsh_code = "";
    this.lsh_code_valid = false;
};
518 | ||
519 | // len_diff defaults to true | |
/**
 * Distance score between this digest and another one.
 *
 * The score is the sum of: the length penalty (only when len_diff is truthy),
 * the penalties for the low and high Q-ratio nibbles, 1 if any checksum byte
 * differs, and the h_distance of the two body codes.
 *
 * @param {Tlsh}    other      digest to compare against
 * @param {boolean} [len_diff] include the length penalty; defaults to true
 * @returns {number} 0 when other is this very object, otherwise the score
 */
Tlsh.prototype.totalDiff = function(other, len_diff)
{
    if (this == other) {
        return 0;
    }

    if (typeof len_diff === 'undefined') {
        len_diff = true;
    }

    var score = 0;

    // Length penalty: gaps of 0 and 1 are scored 0 and 1, larger gaps cost 12 each.
    if (len_diff) {
        var lengthGap = mod_diff( this.Lvalue, other.Lvalue, RANGE_LVALUE);
        score = (lengthGap == 0) ? 0 : ((lengthGap == 1) ? 1 : score + lengthGap*12);
    }

    // Q-ratio penalties: a gap of at most 1 counts as-is, beyond that (gap-1)*12.
    var loGap = mod_diff( getQLo(this.Q), getQLo(other.Q), RANGE_QRATIO);
    score += (loGap <= 1) ? loGap : (loGap-1)*12;

    var hiGap = mod_diff( getQHi(this.Q), getQHi(other.Q), RANGE_QRATIO);
    score += (hiGap <= 1) ? hiGap : (hiGap-1)*12;

    // A checksum mismatch costs exactly one point, however many bytes differ.
    var checksumDiffers = false;
    for (var idx = 0; idx < TLSH_CHECKSUM_LEN && !checksumDiffers; idx++) {
        checksumDiffers = (this.checksum[idx] != other.checksum[idx]);
    }
    if (checksumDiffers) {
        score++;
    }

    score += h_distance( CODE_SIZE, this.tmp_code, other.tmp_code );

    return score;
};
563 | ||
/**
 * Load a digest from its hex string form.
 *
 * The string must be exactly TLSH_STRING_LEN characters, all hex digits;
 * otherwise an alert is raised and the object is left untouched. On success
 * the checksum, Lvalue, Q and body code are filled in and the hash is marked
 * valid.
 *
 * @param {string} str hex digest to load
 */
Tlsh.prototype.fromTlshStr = function(str)
{
    if (str.length != TLSH_STRING_LEN) {
        alert("Tlsh.fromTlshStr() - string has wrong length (" + str.length + " != " + TLSH_STRING_LEN + ")");
        return;
    }

    for (var pos = 0; pos < TLSH_STRING_LEN; pos++) {
        var ch = str[pos];
        var isDigit    = (ch >= '0' && ch <= '9');
        var isUpperHex = (ch >= 'A' && ch <= 'F');
        var isLowerHex = (ch >= 'a' && ch <= 'f');
        if (!isDigit && !isUpperHex && !isLowerHex) {
            alert("Tlsh.fromTlshStr() - string has invalid (non-hex) characters");
            return;
        }
    }

    var bytes = from_hex(str);

    // Field order follows lsh_bin: checksum, Lvalue, Q, then the body code.
    // Also note that TLSH_CHECKSUM_LEN is 1, so the checksum is a single byte.
    this.checksum[0] = swap_byte( bytes[0] );
    this.Lvalue = swap_byte( bytes[1] );
    this.Q = swap_byte( bytes[2] );

    // The body code is stored back to front in the string.
    var bodyStart = 3;
    for (var n = 0; n < CODE_SIZE; n++) {
        this.tmp_code[n] = bytes[bodyStart + CODE_SIZE - 1 - n];
    }
    this.lsh_code_valid = true;
};
0 | #include <math.h> | |
1 | #include <stdlib.h> | |
2 | #include <stdio.h> | |
3 | ||
4 | ///////////////////////////////////////////////////////////////////////////// | |
5 | // Tlsh.java code to generate the bit_pairs_diff_table in tlsh_util.cpp | |
6 | ||
int result[256][256];

/*
 * Fill result[i][j] with the distance between bytes i and j.
 *
 * Each byte is treated as four 2-bit fields. For every field the absolute
 * difference is taken; a difference of 3 is scored as 6, anything smaller is
 * scored as itself. The four field scores are summed.
 */
void generateTable()
{
    for (int row = 0; row < 256; row++) {
        for (int col = 0; col < 256; col++) {
            int total = 0;
            /* walk the four 2-bit fields, lowest first */
            for (int shift = 0; shift < 8; shift += 2) {
                int gap = abs(((row >> shift) & 3) - ((col >> shift) & 3));
                total += (gap == 3) ? 6 : gap;
            }
            result[row][col] = total;
        }
    }
}
25 | ||
26 | ///////////////////////////////////////////////////////////////////////////// | |
27 | // Jon Oliver's functions to generate bit_pairs_diff_table | |
28 | ||
/*
 * Score the difference between two values: an absolute gap of 0, 1 or 2 is
 * returned unchanged, any larger gap is scored as 6.
 */
static int pairbit_diff(int pairb, int opairb)
{
    const int gap = abs(pairb - opairb);
    return (gap > 2) ? 6 : gap;
}
38 | ||
/*
 * Distance between two bytes: split each byte into four 2-bit fields and sum
 * pairbit_diff() over the corresponding fields, highest field first.
 */
int byte_diff(unsigned char bv, unsigned char obv)
{
    int total = 0;
    for (int shift = 6; shift >= 0; shift -= 2) {
        total += pairbit_diff((bv >> shift) & 3, (obv >> shift) & 3);
    }
    return total;
}
60 | ||
61 | ///////////////////////////////////////////////////////////////////////////// | |
62 | // main() function to verify Tlsh.java and Jon's implementations are equivalent, | |
63 | // and to output the static unsigned char bit_pairs_diff_table in | |
64 | // tlsh_util.cpp. | |
65 | int main() | |
66 | { | |
67 | int x; | |
68 | int y; | |
69 | generateTable(); | |
70 | for (x=0; x<256; x++) { | |
71 | printf("{\n"); | |
72 | for (y=0; y<256; y++) { | |
73 | int z = byte_diff((unsigned char) x, (unsigned char) y); | |
74 | if (z != result[x][y]) { | |
75 | printf("\nWARNING x=%d y=%d z=%d nuno=%d\n", x, y, z, result[x][y]); | |
76 | return -1; | |
77 | } | |
78 | printf("%d", z); | |
79 | if (y < 255) | |
80 | printf(", "); | |
81 | if (y % 16 == 15) | |
82 | printf("\n"); | |
83 | } | |
84 | printf("}"); | |
85 | if (x < 255) | |
86 | printf(","); | |
87 | printf("\n"); | |
88 | } | |
89 | printf("};"); | |
90 | } |
83 | 83 | if ( this == other ) |
84 | 84 | return 0; |
85 | 85 | else |
86 | return (impl.totalDiff(other->impl, len_diff)+1); | |
86 | return (impl.totalDiff(other->impl, len_diff)); | |
87 | 87 | } |
88 | 88 | |
89 | 89 | int Tlsh::fromTlshStr(const char* str) |
41 | 41 | |
42 | 42 | TlshImpl::TlshImpl() : a_bucket(NULL), data_len(0), lsh_code(NULL), lsh_code_valid(false) |
43 | 43 | { |
44 | memset(this->slide_window, 0, sizeof this->slide_window); | |
44 | 45 | memset(&this->lsh_bin, 0, sizeof this->lsh_bin); |
45 | 46 | } |
46 | 47 | |
53 | 54 | void TlshImpl::reset() |
54 | 55 | { |
55 | 56 | delete [] this->a_bucket; this->a_bucket = NULL; |
57 | memset(this->slide_window, 0, sizeof this->slide_window); | |
56 | 58 | delete [] this->lsh_code; this->lsh_code = NULL; |
57 | 59 | memset(&this->lsh_bin, 0, sizeof this->lsh_bin); |
58 | 60 | this->data_len = 0; |
72 | 74 | memset(this->a_bucket, 0, sizeof(int)*BUCKETS); |
73 | 75 | } |
74 | 76 | |
75 | unsigned char slide_window[SLIDING_WND_SIZE]; | |
76 | memset(slide_window, 0, sizeof(slide_window)); | |
77 | 77 | for( unsigned int i=0; i<len; i++, fed_len++, j=RNG_IDX(j+1) ) { |
78 | slide_window[j] = data[i]; | |
78 | this->slide_window[j] = data[i]; | |
79 | 79 | |
80 | 80 | if ( fed_len >= 4 ) { |
81 | 81 | //only calculate when input >= 5 bytes |
86 | 86 | |
87 | 87 | for (int k = 0; k < TLSH_CHECKSUM_LEN; k++) { |
88 | 88 | if (k == 0) { |
89 | this->lsh_bin.checksum[k] = b_mapping(0, slide_window[j], slide_window[j_1], this->lsh_bin.checksum[k]); | |
89 | this->lsh_bin.checksum[k] = b_mapping(0, this->slide_window[j], this->slide_window[j_1], this->lsh_bin.checksum[k]); | |
90 | 90 | } |
91 | 91 | else { |
92 | 92 | // use calculated 1 byte checksums to expand the total checksum to 3 bytes |
93 | this->lsh_bin.checksum[k] = b_mapping(this->lsh_bin.checksum[k-1], slide_window[j], slide_window[j_1], this->lsh_bin.checksum[k]); | |
93 | this->lsh_bin.checksum[k] = b_mapping(this->lsh_bin.checksum[k-1], this->slide_window[j], this->slide_window[j_1], this->lsh_bin.checksum[k]); | |
94 | 94 | } |
95 | 95 | } |
96 | 96 | |
97 | 97 | unsigned char r; |
98 | r = b_mapping(2, slide_window[j], slide_window[j_1], slide_window[j_2]); | |
99 | this->a_bucket[r]++; | |
100 | r = b_mapping(3, slide_window[j], slide_window[j_1], slide_window[j_3]); | |
101 | this->a_bucket[r]++; | |
102 | r = b_mapping(5, slide_window[j], slide_window[j_2], slide_window[j_3]); | |
103 | this->a_bucket[r]++; | |
104 | r = b_mapping(7, slide_window[j], slide_window[j_2], slide_window[j_4]); | |
105 | this->a_bucket[r]++; | |
106 | r = b_mapping(11, slide_window[j], slide_window[j_1], slide_window[j_4]); | |
107 | this->a_bucket[r]++; | |
108 | r = b_mapping(13, slide_window[j], slide_window[j_3], slide_window[j_4]); | |
98 | r = b_mapping(2, this->slide_window[j], this->slide_window[j_1], this->slide_window[j_2]); | |
99 | this->a_bucket[r]++; | |
100 | r = b_mapping(3, this->slide_window[j], this->slide_window[j_1], this->slide_window[j_3]); | |
101 | this->a_bucket[r]++; | |
102 | r = b_mapping(5, this->slide_window[j], this->slide_window[j_2], this->slide_window[j_3]); | |
103 | this->a_bucket[r]++; | |
104 | r = b_mapping(7, this->slide_window[j], this->slide_window[j_2], this->slide_window[j_4]); | |
105 | this->a_bucket[r]++; | |
106 | r = b_mapping(11, this->slide_window[j], this->slide_window[j_1], this->slide_window[j_4]); | |
107 | this->a_bucket[r]++; | |
108 | r = b_mapping(13, this->slide_window[j], this->slide_window[j_3], this->slide_window[j_4]); | |
109 | 109 | this->a_bucket[r]++; |
110 | 110 | |
111 | 111 | } |
116 | 116 | /* to signal the class there is no more data to be added */ |
117 | 117 | void TlshImpl::final() |
118 | 118 | { |
119 | // incoming data must more than or equal to 512 bytes | |
120 | if (this->data_len < 512) { | |
119 | // incoming data must more than or equal to MIN_DATA_LENGTH bytes | |
120 | if (this->data_len < MIN_DATA_LENGTH) { | |
121 | 121 | // this->lsh_code be empty |
122 | 122 | delete [] this->a_bucket; this->a_bucket = NULL; |
123 | 123 | return; |
175 | 175 | { |
176 | 176 | return 1; |
177 | 177 | } |
178 | this->reset(); | |
179 | ||
180 | lsh_bin_struct tmp; | |
181 | from_hex( str, TLSH_STRING_LEN, (unsigned char*)&tmp ); | |
182 | ||
183 | // Reconstruct checksum, Qrations & lvalue | |
184 | for (int k = 0; k < TLSH_CHECKSUM_LEN; k++) { | |
185 | this->lsh_bin.checksum[k] = swap_byte(tmp.checksum[k]); | |
186 | } | |
187 | this->lsh_bin.Lvalue = swap_byte( tmp.Lvalue ); | |
188 | this->lsh_bin.Q.QB = swap_byte(tmp.Q.QB); | |
189 | for( int i=0; i < CODE_SIZE; i++ ){ | |
190 | this->lsh_bin.tmp_code[i] = (tmp.tmp_code[CODE_SIZE-1-i]); | |
191 | } | |
192 | this->lsh_code_valid = true; | |
193 | ||
194 | return 0; | |
178 | ||
179 | this->reset(); | |
180 | ||
181 | lsh_bin_struct tmp; | |
182 | from_hex( str, TLSH_STRING_LEN, (unsigned char*)&tmp ); | |
183 | ||
184 | // Reconstruct checksum, Qrations & lvalue | |
185 | for (int k = 0; k < TLSH_CHECKSUM_LEN; k++) { | |
186 | this->lsh_bin.checksum[k] = swap_byte(tmp.checksum[k]); | |
187 | } | |
188 | this->lsh_bin.Lvalue = swap_byte( tmp.Lvalue ); | |
189 | this->lsh_bin.Q.QB = swap_byte(tmp.Q.QB); | |
190 | for( int i=0; i < CODE_SIZE; i++ ){ | |
191 | this->lsh_bin.tmp_code[i] = (tmp.tmp_code[CODE_SIZE-1-i]); | |
192 | } | |
193 | this->lsh_code_valid = true; | |
194 | ||
195 | return 0; | |
195 | 196 | } |
196 | 197 | |
197 | 198 | const char* TlshImpl::hash(char *buffer, unsigned int bufSize) |
275 | 276 | |
276 | 277 | diff += h_distance( CODE_SIZE, this->lsh_bin.tmp_code, other.lsh_bin.tmp_code ); |
277 | 278 | |
278 | return (diff - 1); | |
279 | return (diff); | |
279 | 280 | } |
280 | 281 | |
281 | 282 |
39 | 39 | 51, 65, 28, 144, 254, 221, 93, 189, 194, 139, 112, 43, 71, 109, 184, 209 |
40 | 40 | }; |
41 | 41 | |
42 | // Compile and run gen_arr2.cpp to generate bit_pairs_diff_table | |
42 | 43 | static unsigned char bit_pairs_diff_table[][256] = { |
43 | 44 | { |
44 | 45 | 0, 1, 2, 6, 1, 2, 3, 7, 2, 3, 4, 8, 6, 7, 8, 12, |
21 | 21 | set_target_properties(simple_unittest PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/test) |
22 | 22 | set_target_properties(simple_unittest PROPERTIES OUTPUT_NAME simple_unittest${BUILD_POSTFIX}) |
23 | 23 | |
24 | if(CMAKE_COMPILER_IS_GNUCXX) | |
25 | add_executable(tlsh_unittest tlsh_unittest.cpp) | |
26 | target_link_libraries(tlsh_unittest tlsh) | |
27 | set_target_properties(tlsh_unittest PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/bin) | |
28 | set_target_properties(tlsh_unittest PROPERTIES OUTPUT_NAME tlsh_unittest${BUILD_POSTFIX}) | |
29 | endif() | |
24 | add_executable(tlsh_unittest tlsh_unittest.cpp) | |
25 | target_link_libraries(tlsh_unittest tlsh) | |
26 | set_target_properties(tlsh_unittest PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/bin) | |
27 | set_target_properties(tlsh_unittest PROPERTIES OUTPUT_NAME tlsh_unittest${BUILD_POSTFIX}) |
23 | 23 | #include <stdio.h> |
24 | 24 | #include <stdlib.h> |
25 | 25 | #include <string.h> |
26 | #include <assert.h> | |
26 | 27 | |
27 | 28 | #include "tlsh.h" |
28 | 29 | |
61 | 62 | printf("difference (same strings) = %d\n", t1.totalDiff(&t1) ); |
62 | 63 | printf("difference (with len) = %d\n", t1.totalDiff(&t2) ); |
63 | 64 | printf("difference (without len) = %d\n", t1.totalDiff(&t2, false) ); |
65 | ||
66 | printf("Testing Tlsh with multiple update calls\n"); | |
67 | Tlsh t3, t4; | |
68 | snprintf(minSizeBuffer1, sizeof(minSizeBuffer1), "%s", str1); | |
69 | t3.update( (const unsigned char*) minSizeBuffer1, len1); | |
70 | for (int i = 0; i < 511; i++) { | |
71 | minSizeBuffer1[i] = i % 26 + 'A'; | |
72 | } | |
73 | minSizeBuffer1[511] = 0; | |
74 | t3.update( (const unsigned char*) minSizeBuffer1+len1, 512-len1); | |
75 | t3.final(); | |
76 | assert(strcmp(t1.getHash(), t3.getHash()) == 0); | |
77 | ||
78 | snprintf(minSizeBuffer2, sizeof(minSizeBuffer2), "%s", str2); | |
79 | t4.update( (const unsigned char*) minSizeBuffer2, len2); | |
80 | for (int i = 0; i < 1023; i++) { | |
81 | minSizeBuffer2[i] = i % 26 + 'A'; | |
82 | } | |
83 | minSizeBuffer1[1023] = 0; | |
84 | t4.final( (const unsigned char*) minSizeBuffer2+len2, 1024-len2); | |
85 | assert(strcmp(t2.getHash(), t4.getHash()) == 0); | |
86 | ||
87 | printf("hash3 = %s\n", t3.getHash() ); | |
88 | printf("hash4 = %s\n", t4.getHash() ); | |
89 | ||
90 | printf("Testing Tlsh.fromTlshStr()\n"); | |
91 | printf("Recreating tlsh3 from %s\n", t1.getHash(minSizeBuffer1, sizeof(minSizeBuffer1))); | |
92 | t3.reset(); | |
93 | t3.fromTlshStr(minSizeBuffer1); | |
94 | printf("hash3 = %s\n", t3.getHash(minSizeBuffer2, sizeof(minSizeBuffer2))); | |
95 | assert(strcmp(minSizeBuffer1, minSizeBuffer2) == 0); | |
96 | ||
97 | printf("Recreating tlsh4 from %s\n", t2.getHash(minSizeBuffer1, sizeof(minSizeBuffer1))); | |
98 | t4.reset(); | |
99 | t4.fromTlshStr(minSizeBuffer1); | |
100 | printf("hash4 = %s\n", t4.getHash(minSizeBuffer2, sizeof(minSizeBuffer2))); | |
101 | assert(strcmp(minSizeBuffer1, minSizeBuffer2) == 0); | |
102 | printf("difference (same strings) = %d\n", t3.totalDiff(&t3) ); | |
103 | printf("difference (with len) = %d\n", t3.totalDiff(&t4) ); | |
104 | printf("difference (without len) = %d\n", t3.totalDiff(&t4, false) ); | |
64 | 105 | } |
57 | 57 | |
58 | 58 | fclose(fd); |
59 | 59 | |
60 | if (sizefile < 512) | |
60 | if (sizefile < MIN_DATA_LENGTH) | |
61 | 61 | return(WARNING_FILE_TOO_SMALL); |
62 | 62 | |
63 | 63 | /////////////////////////////////////// |
145 | 145 | int n_file = 0; |
146 | 146 | while (dit != NULL) { |
147 | 147 | char tmp_fname[2000]; |
148 | strncpy(tmp_fname, dirname, sizeof(tmp_fname)); | |
149 | strncat(tmp_fname, "/", sizeof(tmp_fname)); | |
150 | strncat(tmp_fname, dit->d_name, sizeof(tmp_fname)); | |
151 | if (strlen(tmp_fname) < sizeof(tmp_fname) - 2) { | |
148 | int len = snprintf(tmp_fname, sizeof(tmp_fname)-1, "%s/%s", dirname, dit->d_name); | |
149 | if (len < sizeof(tmp_fname) - 2) { | |
152 | 150 | if (is_dir(tmp_fname) ) { |
153 | 151 | if ((strcmp(dit->d_name, ".") == 0) || (strcmp(dit->d_name, "..") == 0)) { |
154 | 152 | ; |
179 | 177 | dit = readdir(dip); |
180 | 178 | while (dit != NULL) { |
181 | 179 | char tmp_fname[2000]; |
182 | strncpy(tmp_fname, dirname, sizeof(tmp_fname)); | |
183 | strncat(tmp_fname, "/", sizeof(tmp_fname)); | |
184 | strncat(tmp_fname, dit->d_name, sizeof(tmp_fname)); | |
180 | int len = snprintf(tmp_fname, sizeof(tmp_fname)-1, "%s/%s", dirname, dit->d_name); | |
185 | 181 | // -2 for safety |
186 | if (strlen(tmp_fname) < sizeof(tmp_fname) - 2) { | |
182 | if (len < sizeof(tmp_fname) - 2) { | |
187 | 183 | if (is_dir(tmp_fname) ) { |
188 | 184 | if ((strcmp(dit->d_name, ".") == 0) || (strcmp(dit->d_name, "..") == 0)) { |
189 | 185 | ; |