Java SimHash 是什麼?讓我們一起來了解一下吧!
Java SimHash 是 Java 程序中實現的一種算法。SimHash 算法產生於 2002 年,設計非常美妙:它的輸入是一個向量,輸出是一個 f 位的簽名值。
Simhash和一般的hash算法不同,它具有兩個(gè)關(guān)鍵的特點(diǎn):
1.一個(gè)文檔的指紋是所有屬性的某種hash;
2.相似文檔的hash應(yīng)該是相似的;
SimHash 算法如下:1,將一個 f 維的向量 V 初始化為 0;f 位的二進制數 S 初始化為 0;2,對每一個特徵:用傳統的 hash 算法對該特徵產生一個 f 位的簽名 b。對 i=1 到 f:如果 b 的第 i 位為 1,則 V 的第 i 個元素加上該特徵的權重;否則,V 的第 i 個元素減去該特徵的權重;3,如果 V 的第 i 個元素大於 0,則 S 的第 i 位為 1,否則為 0;4,輸出 S 作為簽名。
simhash 算法代碼:
package com.xxxx.checkandbigdataquery.utils;

import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.CharBuffer;
import java.util.Set;

/**
 * A basic SimHash implementation.
 *
 * <p>Every feature (shingle) of a document is hashed to a 64-bit value. Each
 * bit position then receives a +1 vote when the bit is set and a -1 vote when
 * it is clear. The final fingerprint has a 1 in every position whose vote
 * total is strictly positive, so documents sharing most of their features
 * end up with fingerprints that differ in only a few bits.
 */
public class SimHash {

  /** Number of bits in a fingerprint. */
  public static final int HASH_SIZE = 64;

  /**
   * NOTE: {@code ^} is bitwise XOR in Java, not exponentiation, so this
   * constant evaluates to 66 (2 XOR 64). The value is kept unchanged because
   * the constant is public and 2^64 cannot be represented in a {@code long}.
   */
  public static final long HASH_RANGE = 2 ^ HASH_SIZE;

  /**
   * Not referenced by the methods of this class; retained because it is a
   * public field.
   */
  public static MurmurHash hasher = new MurmurHash();

  /** Number of characters packed into one shingle (4 chars x 16 bits = 64 bits). */
  private static final int FIXED_CGRAM_LENGTH = 4;

  /**
   * Uses shortcuts to obtain a speed-optimized simhash calculation.
   *
   * @param s input string
   * @return 64 bit simhash of input string
   */
  public static long computeOptimizedSimHashForString(String s) {
    return computeOptimizedSimHashForString(CharBuffer.wrap(s));
  }

  /**
   * Computes a simhash over the distinct 4-character n-grams of the input.
   * Each n-gram is packed directly into a {@code long} instead of being
   * materialized as a {@code String}.
   *
   * @param s input characters; an input shorter than 4 characters yields 0
   * @return 64 bit simhash of the input
   */
  public static long computeOptimizedSimHashForString(CharBuffer s) {
    int length = s.length();
    LongSet shingles = new LongOpenHashSet(Math.min(length, 100000));

    for (int i = 0; i + FIXED_CGRAM_LENGTH <= length; i++) {
      long shingle = 0;
      for (int j = 0; j < FIXED_CGRAM_LENGTH; j++) {
        shingle = (shingle << 16) | s.charAt(i + j);
      }
      shingles.add(shingle);
    }

    int[] v = new int[HASH_SIZE];
    byte[] longAsBytes = new byte[8];

    for (long shingle : shingles) {
      // Big-endian byte image of the shingle, fed to the fingerprint function.
      longAsBytes[0] = (byte) (shingle >> 56);
      longAsBytes[1] = (byte) (shingle >> 48);
      longAsBytes[2] = (byte) (shingle >> 40);
      longAsBytes[3] = (byte) (shingle >> 32);
      longAsBytes[4] = (byte) (shingle >> 24);
      longAsBytes[5] = (byte) (shingle >> 16);
      longAsBytes[6] = (byte) (shingle >> 8);
      longAsBytes[7] = (byte) (shingle);

      addVotes(v, FPGenerator.std64.fp(longAsBytes, 0, 8));
    }

    return collapseVotes(v);
  }

  /**
   * Computes a simhash from a pre-built set of string shingles.
   *
   * @param shingles the document's shingles
   * @return 64 bit simhash of the shingle set
   */
  public static long computeSimHashFromString(Set<String> shingles) {
    int[] v = new int[HASH_SIZE];
    for (String shingle : shingles) {
      // Encoded with the platform default charset.
      byte[] bytes = shingle.getBytes();
      addVotes(v, FPGenerator.std64.fp(bytes, 0, bytes.length));
    }
    return collapseVotes(v);
  }

  /**
   * Counts the bit positions in which two fingerprints differ.
   *
   * @param hash1 first fingerprint
   * @param hash2 second fingerprint
   * @return number of differing bits, between 0 and 64
   */
  public static int hammingDistance(long hash1, long hash2) {
    return Long.bitCount(hash1 ^ hash2);
  }

  /**
   * Rotates a fingerprint left by one bit.
   *
   * @param hashValue fingerprint to rotate
   * @return the rotated value
   */
  public static long rotate(long hashValue) {
    return Long.rotateLeft(hashValue, 1);
  }

  /** Adds +1 to v[i] for every set bit i of hash and -1 for every clear bit. */
  private static void addVotes(int[] v, long hash) {
    for (int i = 0; i < HASH_SIZE; ++i) {
      boolean bitSet = ((hash >> i) & 1L) == 1L;
      v[i] += bitSet ? 1 : -1;
    }
  }

  /** Builds the fingerprint: bit i is 1 exactly when v[i] is strictly positive. */
  private static long collapseVotes(int[] v) {
    long simhash = 0;
    for (int i = 0; i < HASH_SIZE; ++i) {
      if (v[i] > 0) {
        simhash |= (1L << i);
      }
    }
    return simhash;
  }

  /**
   * Reads a whole file into a String using the platform default charset.
   * Loops until the buffer is full or end-of-file is reached, because a
   * single read() call may return fewer bytes than requested. The stream is
   * always closed.
   */
  private static String readFile(File file) throws IOException {
    byte[] data = new byte[(int) file.length()];
    FileInputStream stream = new FileInputStream(file);
    try {
      int offset = 0;
      while (offset < data.length) {
        int read = stream.read(data, offset, data.length - offset);
        if (read < 0) {
          break;
        }
        offset += read;
      }
      return new String(data, 0, offset);
    } finally {
      stream.close();
    }
  }

  /**
   * Compares two files with both simhash variants and prints the timing of
   * each calculation and the resulting Hamming distances.
   *
   * @param args paths of the two documents to compare
   */
  public static void main(String[] args) {
    if (args.length < 2) {
      System.err.println("usage: SimHash <file1> <file2>");
      return;
    }
    try {
      String string1 = readFile(new File(args[0]));
      String string2 = readFile(new File(args[1]));

      long timeStart = System.currentTimeMillis();
      long simhash1 = computeSimHashFromString(Shingle.shingles(string1));
      long timeEnd = System.currentTimeMillis();
      System.out.println("Old Calc for Document A Took:"
          + (timeEnd - timeStart));
      timeStart = System.currentTimeMillis();
      long simhash2 = computeSimHashFromString(Shingle.shingles(string2));
      timeEnd = System.currentTimeMillis();
      System.out.println("Old Calc for Document B Took:"
          + (timeEnd - timeStart));
      timeStart = System.currentTimeMillis();
      long simhash3 = computeOptimizedSimHashForString(string1);
      timeEnd = System.currentTimeMillis();
      System.out.println("New Calc for Document A Took:"
          + (timeEnd - timeStart));
      timeStart = System.currentTimeMillis();
      long simhash4 = computeOptimizedSimHashForString(string2);
      timeEnd = System.currentTimeMillis();
      System.out.println("New Calc for Document B Took:"
          + (timeEnd - timeStart));

      int hammingDistance = hammingDistance(simhash1, simhash2);
      int hammingDistance2 = hammingDistance(simhash3, simhash4);

      System.out.println("hammingdistance Doc (A) to Doc(B) OldWay:"
          + hammingDistance);
      System.out.println("hammingdistance Doc (A) to Doc(B) NewWay:"
          + hammingDistance2);
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
以上就是小編今天的分享了,希望可以幫助到大家。