CharsRefIntHashMap is not faster than HashMap&lt;String, Integer&gt; for string-to-integer counting

Source: Internet
Author: User
Tags rehash

CharsRefIntHashMap is not faster than HashMap <String, Integer>.

I copied BytesRef of lucene and wrote a CharsRefIntHashMap, which is not as effective as HashMap <String, Integer>. The Code is as follows:

Package com. dp. arts. cmdex. utils;


Import org. apache. lucene. util. CharsRef;


Public interface CharsRefIntMap

{

Public staticabstract class CharsRefIntEntryAccessor {

Public synchronized actvoid access (char [] arr, int offset, int length, int value );

}

Public void incKey (CharsRef key );

Public void incKey (CharsRef key, int add );

Public void incKey (char [] arr, int offset, int length );

Public void incKey (char [] arr, int offset, int length, int add );

Public int get (CharsRef key );

Public int get (CharsRef key, int no_entry_value );

Public int get (char [] arr, int offset, int length );

Public int get (char [] arr, int offset, int length, int no_entry_value );

Public int size ();

Public void forEach (CharsRefIntEntryAccessor accesor );

}


package com.dp.arts.cmdex.utils;

import java.util.Arrays;

import org.apache.lucene.util.CharsRef;

// NOTE(review): the original import referenced com.dp.arts.javasex.utils, which does
// not match this file's own package (com.dp.arts.cmdex.utils) — corrected here.
import com.dp.arts.cmdex.utils.CharsRefIntMap.CharsRefIntEntryAccessor;

/**
 * Open-addressing hash map from char-sequence keys to int counters, modeled on
 * Lucene's BytesRefHash: parallel arrays hold the entries in insertion order,
 * and {@code ords} maps hash slots to entry ordinals (-1 = empty slot).
 *
 * <p>WARNING: keys are stored by REFERENCE to the caller's {@code char[]};
 * they are not copied. If the caller mutates the array after insertion, the
 * map is silently corrupted. Callers must treat inserted arrays as immutable.
 *
 * <p>Not thread-safe.
 */
public class CharsRefIntHashMap implements CharsRefIntMap
{
    public static final int DEFAULT_CAPACITY = 16;

    // Entry storage, indexed by insertion ordinal [0, count).
    private char[][] arrs;
    private int[] offsets;
    private int[] lengths;
    // Hash table: slot -> entry ordinal, or -1 for an empty slot.
    private int[] ords;
    private int[] values;

    private int hashSize;       // current table capacity (always a power of two)
    private int halfHashSize;   // load-factor threshold: rehash when count reaches this
    private int hashMask;       // hashSize - 1, for cheap modulo
    private int count;          // number of entries stored

    public CharsRefIntHashMap() {
        this(DEFAULT_CAPACITY);
    }

    /**
     * @param capacity initial table size; must be a positive power of two so
     *                 that {@code hashMask} works as a modulo.
     */
    public CharsRefIntHashMap(int capacity) {
        assert capacity > 0 && ((capacity & (capacity - 1)) == 0);

        arrs = new char[capacity][];
        offsets = new int[capacity];
        lengths = new int[capacity];
        ords = new int[capacity];
        values = new int[capacity];

        Arrays.fill(ords, -1);
        hashSize = capacity;
        halfHashSize = capacity >>> 1;
        hashMask = capacity - 1;
    }

    @Override
    public void incKey(CharsRef key) {
        int code = charsHashCode(key.chars, key.offset, key.length);
        incKey(key.chars, key.offset, key.length, code, 1);
    }

    @Override
    public void incKey(CharsRef key, int add) {
        int code = charsHashCode(key.chars, key.offset, key.length);
        incKey(key.chars, key.offset, key.length, code, add);
    }

    @Override
    public void incKey(char[] arr, int offset, int length) {
        int code = charsHashCode(arr, offset, length);
        incKey(arr, offset, length, code, 1);
    }

    @Override
    public void incKey(char[] arr, int offset, int length, int add) {
        int code = charsHashCode(arr, offset, length);
        incKey(arr, offset, length, code, add);
    }

    /**
     * Core insert/update: probes for the key's slot, then either bumps the
     * existing counter or appends a new entry (storing the caller's array by
     * reference — see class javadoc).
     */
    private void incKey(char[] arr, int offset, int length, int code, int add) {
        int pos = (code & hashMask);
        int e = ords[pos];
        while (e != -1 && !charsEquals(arrs[e], offsets[e], lengths[e], arr, offset, length)) {
            // BytesRefHash-style probe: derive the next slot from the running code.
            final int inc = ((code >> 8) + code) | 1;
            code += inc;
            pos = (code & hashMask);
            e = ords[pos];
        }
        if (e == -1) {
            // New entry: append at ordinal `count` and point the slot at it.
            arrs[count] = arr;
            offsets[count] = offset;
            lengths[count] = length;
            values[count] = add;
            ords[pos] = count;
            ++count;
            // Keep load factor at 50%: entry arrays therefore never need more
            // than halfHashSize slots before the table doubles.
            if (count == halfHashSize) {
                rehash(hashSize << 1);
            }
        } else {
            values[e] += add;
        }
    }

    /** Doubles the table: copies entry storage, then re-probes every occupied slot. */
    private void rehash(int newSize) {
        char[][] newArrs = new char[newSize][];
        int[] newOffsets = new int[newSize];
        int[] newLengths = new int[newSize];
        int[] newValues = new int[newSize];
        // Only halfHashSize entry slots can be occupied (rehash fires at 50% load).
        System.arraycopy(arrs, 0, newArrs, 0, halfHashSize);
        System.arraycopy(offsets, 0, newOffsets, 0, halfHashSize);
        System.arraycopy(lengths, 0, newLengths, 0, halfHashSize);
        System.arraycopy(values, 0, newValues, 0, halfHashSize);

        final int[] newOrds = new int[newSize];
        Arrays.fill(newOrds, -1);
        final int newHashMask = newSize - 1;
        for (int i = 0; i < hashSize; i++) {
            int e0 = ords[i];
            if (e0 != -1) {
                char[] arr = newArrs[e0];
                int offset = newOffsets[e0];
                int length = newLengths[e0];
                int code = charsHashCode(arr, offset, length);
                int pos = code & newHashMask;
                while (newOrds[pos] != -1) {
                    final int inc = ((code >> 8) + code) | 1;
                    code += inc;
                    pos = code & newHashMask;
                }
                newOrds[pos] = e0;
            }
        }

        ords = newOrds;
        arrs = newArrs;
        offsets = newOffsets;
        lengths = newLengths;
        values = newValues;

        hashSize = newSize;
        halfHashSize = newSize >>> 1;
        hashMask = newHashMask;
    }

    /** Same polynomial (base-31) hash as {@link String#hashCode()} over the slice. */
    public int charsHashCode(char[] chars, int offset, int length) {
        final int prime = 31;
        int result = 0;
        final int end = offset + length;
        for (int i = offset; i < end; i++) {
            result = prime * result + chars[i];
        }
        return result;
    }

    /** Compares two char slices for content equality. */
    public boolean charsEquals(char[] lhsArr, int lhsOffset, int lhsLength,
                               char[] rhsArr, int rhsOffset, int rhsLength) {
        if (lhsLength == rhsLength) {
            int otherUpto = rhsOffset;
            final int end = lhsOffset + lhsLength;
            for (int upto = lhsOffset; upto < end; upto++, otherUpto++) {
                if (lhsArr[upto] != rhsArr[otherUpto]) {
                    return false;
                }
            }
            return true;
        } else {
            return false;
        }
    }

    @Override
    public int get(CharsRef key) {
        // NOTE: the default no-entry value 0 is ambiguous with a stored count of 0;
        // use the no_entry_value overload if that distinction matters.
        return get(key.chars, key.offset, key.length, 0);
    }

    @Override
    public int get(CharsRef key, int no_entry_value) {
        return get(key.chars, key.offset, key.length, no_entry_value);
    }

    @Override
    public int get(char[] arr, int offset, int length) {
        return get(arr, offset, length, 0);
    }

    @Override
    public int get(char[] arr, int offset, int length, int no_entry_value) {
        int code = charsHashCode(arr, offset, length);
        int pos = (code & hashMask);
        int e = ords[pos];
        while (e != -1 && !charsEquals(arrs[e], offsets[e], lengths[e], arr, offset, length)) {
            final int inc = ((code >> 8) + code) | 1;
            code += inc;
            pos = (code & hashMask);
            e = ords[pos];
        }
        return e == -1 ? no_entry_value : values[e];
    }

    @Override
    public void forEach(CharsRefIntEntryAccessor accessor) {
        // Walk the slot table; occupied slots point at entry ordinals.
        for (int i = 0; i < hashSize; i++) {
            int pos = ords[i];
            if (pos != -1) {
                accessor.access(arrs[pos], offsets[pos], lengths[pos], values[pos]);
            }
        }
    }

    @Override
    public int size() {
        return count;
    }

    // For test only.
    public int hashSize() {
        return hashSize;
    }
}

package com.dp.arts.cmdex.utils;

import java.util.HashMap;
import java.util.Random;

import org.apache.lucene.util.CharsRef;

/**
 * Micro-benchmark comparing {@link CharsRefIntHashMap#incKey} against the
 * equivalent get/put counting loop on {@code HashMap<String, Integer>}.
 *
 * <p>NOTE(review): this is a naive wall-clock benchmark with no JVM warmup or
 * result blackholing; treat the numbers as rough indications only.
 */
public class CharsRefIntHashMapBenchmark
{
    private static Random randGen = null;
    private static char[] numbersAndLetters = null;

    static {
        randGen = new Random();
        numbersAndLetters = ("0123456789abcdefghijklmnopqrstuvwxyz"
                + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ").toCharArray();
    }

    /** Returns a random alphanumeric string of the given length, or null if length < 1. */
    private static final String randomString(int length) {
        if (length < 1) {
            return null;
        }
        char[] randBuffer = new char[length];
        for (int i = 0; i < randBuffer.length; i++) {
            // BUGFIX: the original used nextInt(71) against a 72-char alphabet,
            // so the last character could never be chosen.
            randBuffer[i] = numbersAndLetters[randGen.nextInt(numbersAndLetters.length)];
        }
        return new String(randBuffer);
    }

    public static void main(String[] args) {
        final int MAX = 100000;
        String[] strs = new String[10000];
        int[] values = new int[MAX];
        for (int i = 0; i < 10000; ++i) {
            strs[i] = randomString(randGen.nextInt(10) + 1);
        }
        for (int i = 0; i < MAX; ++i) {
            values[i] = randGen.nextInt(10000);
        }

        // Pre-extract backing arrays so the CharsRefIntHashMap loop measures
        // only map work, not String decomposition.
        char[][] arrs = new char[MAX][];
        int offsets[] = new int[MAX];
        int counts[] = new int[MAX];
        for (int i = 0; i < MAX; ++i) {
            String s = strs[values[i]];
            arrs[i] = StringMisc.toCharArray(s);
            offsets[i] = StringMisc.getOffset(s);
            counts[i] = StringMisc.getCount(s);
        }

        long start = System.currentTimeMillis();
        CharsRefIntHashMap map = new CharsRefIntHashMap();
        for (int j = 0; j < 100; ++j) {
            for (int i = 0; i < MAX; ++i) {
                map.incKey(arrs[i], offsets[i], counts[i]);
            }
        }
        System.err.println("CharsRefIntHashMap time elapsed: "
                + (System.currentTimeMillis() - start) + "ms.");

        start = System.currentTimeMillis();
        HashMap<String, Integer> oldMap = new HashMap<String, Integer>();
        for (int j = 0; j < 100; ++j) {
            for (int i = 0; i < MAX; ++i) {
                String s = strs[values[i]];
                Integer v = oldMap.get(s);
                if (v == null) {
                    oldMap.put(s, Integer.valueOf(1));
                } else {
                    // BUGFIX: the original did `v += 1` on the boxed local without
                    // writing it back, so the map never saw the increment and the
                    // benchmark skipped the put() cost on the hot path.
                    oldMap.put(s, v + 1);
                }
            }
        }
        System.err.println("Origin string map time elapsed: "
                + (System.currentTimeMillis() - start) + "ms.");
    }
}


In conclusion: the advantage of this implementation is lower memory usage, but its performance turns out to be worse. Rehashing has to copy more data (four parallel entry arrays plus the slot table), and every access goes through an extra level of array indexing via the ordinal table. In practice, each CharsRef object costs about 24 bytes of memory. By comparison, using trove's TObjectIntHashMap gives equivalent insert speed and roughly three times the query speed of the JDK HashMap.

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.