0.7 source: Developer discussion

GPUs and hashes to the max
http://www.cryptohaze.com/forum/
#include <cstdint> // for uint32_t

/*
 * Find the largest bitmap that fits in memory, then the largest hash table that fits in memory.
 * The minimum size for both the bitmap and the hash table is 64 MiB,
 * which is OK since I'm pretty sure the least amount of memory any CUDA-enabled card has is 256 MiB.
 * The maximum size is 2 GiB for the bitmap (34 bits) and 8 GiB for the hash table (31 bits).
 * This function should be changed when cards have over 8 GiB; also change the way lookups in the bitmap and hash table are done.
 *
 * [in]  totalRam             total amount of video RAM in MiB
 * [in]  reserveRam           amount of video RAM in MiB to leave unused (should be around 16 MiB to be safe)
 * [in]  numHashes            total number of hashes we are trying to crack
 * [out] deviceBitmapBits     number of bits for the bitmap on the device
 * [out] deviceHashTableBits  number of bits for the hash table on the device
 * returns true on success
 */
bool calcBitmapAndHashTableBits(uint32_t totalRam, uint32_t reserveRam, uint32_t numHashes, uint32_t &deviceBitmapBits, uint32_t &deviceHashTableBits)
{
    const uint32_t HASH_SIZE = 16 + 16 + 4; // hash size + password size + hash index
    uint32_t ramToUse, ramUsed, hashRam;

    hashRam = ((HASH_SIZE * (numHashes >> 8)) >> 12) + 1; // approximate MiB needed for the hash list, rounded up
    ramToUse = totalRam - reserveRam - hashRam - 64; // 64 MiB is for the hash table
    ramUsed = 64; // use a minimum of 64 MiB for the bitmap on the device
    deviceBitmapBits = 29;
    do
    {
        ramUsed <<= 1;
        deviceBitmapBits++;
    } while (ramUsed <= ramToUse && deviceBitmapBits < 35);
    ramUsed >>= 1;
    deviceBitmapBits--;

    ramToUse = totalRam - reserveRam - hashRam - ramUsed;
    ramUsed = 64; // use a minimum of 64 MiB for the hash table on the device
    deviceHashTableBits = 24;
    do
    {
        ramUsed <<= 1;
        deviceHashTableBits++;
    } while (ramUsed <= ramToUse && deviceHashTableBits < 32);
    deviceHashTableBits--;

    ramUsed = reserveRam + hashRam + (1 << (deviceBitmapBits - 23)) + (1 << (deviceHashTableBits - 18)); // total footprint in MiB
    return ramUsed <= totalRam;
}
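For context, here is a minimal host-side sketch (not from the original post) of how the sizing helper above might be driven. cudaMemGetInfo is a standard CUDA runtime call; the wrapper name pickTableSizes and the 16 MiB reserve are placeholders taken from the comments above.

#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>

// Hypothetical wrapper: queries device memory and feeds it to calcBitmapAndHashTableBits as defined above.
bool pickTableSizes(uint32_t numHashes, uint32_t &bitmapBits, uint32_t &hashTableBits)
{
    size_t freeBytes = 0, totalBytes = 0;
    if (cudaMemGetInfo(&freeBytes, &totalBytes) != cudaSuccess)
        return false;
    uint32_t totalRam = (uint32_t)(totalBytes >> 20); // bytes -> MiB
    uint32_t reserveRam = 16;                         // "around 16 MiB to be safe"
    if (!calcBitmapAndHashTableBits(totalRam, reserveRam, numHashes, bitmapBits, hashTableBits))
        return false;
    printf("bitmap: %u bits (%u MiB), hash table: %u bits (%u MiB)\n",
           bitmapBits, 1u << (bitmapBits - 23),
           hashTableBits, 1u << (hashTableBits - 18));
    return true;
}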
__device__ inline void checkHash(...) // :)
{
    bool goodToGo = false;
    uint32_t x = 0; // bucket index, then index into d_hashes; needs function scope for the second loop below
    if ((s_bitmap[a >> 20] & (1 << (a & 0x7))) &&                     // grab the highest 12 bits from a and the lowest 3 bits from a
        (d_bitmap[b >> (35 - deviceBitmapBits)] & (1 << (c >> 29))))  // grab the highest deviceBitmapBits - 3 bits from b and the highest 3 bits from c
    {
        x = c & ((1 << deviceHashTableBits) - 1); // grab the lowest deviceHashTableBits bits from c
        uint32_t start = d_chainedHashTable[x];
        uint32_t end = d_chainedHashTable[x + 1];
        uint64_t ad = (((uint64_t)a) << 32) | d;
        for (x = start; x < end; x++)
        {
            if (d_hashes[2*x] == ad)
            {
                // I never decided whether to:
                // stop here at 88 to 91 bits matching and report a partial match somehow (and have the CPU check the rest)
                // or continue and check the full hash
                goodToGo = true;
                break;
            }
        }
    }
    __syncthreads();
    if (goodToGo)
    {
        uint64_t bc = (((uint64_t)b) << 32) | c;
        do
        {
            if (d_hashes[2*x+1] == bc)
            {
                // Report success
                // Atomic functions require at least compute capability 1.1 :(
                uint32_t y;
                if (compute_capability_1_0) // :)
                {
                    y = x;
                    d_crackedHash[y] = 1;
                }
                else
                {
                    y = atomicAdd(d_crackedCount, 1);
                    d_crackedHashIndex[y] = x;
                }
                d_crackedPw[y] = curPw; // :)
                break;
            }
            x++;
        } while (d_hashes[2*x] == ad);
    }
    __syncthreads();
}
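checkHash above assumes d_chainedHashTable holds per-bucket start offsets into d_hashes, with each hash stored as two interleaved 64-bit words (ad first, then bc) and the whole list sorted by bucket. A rough host-side sketch of building that layout follows; the struct and function names are hypothetical and not from the original tool.

#include <cstdint>
#include <vector>
#include <algorithm>

struct HashHalves { uint64_t ad; uint64_t bc; }; // the (a,d) and (b,c) halves of one hash

// Bucket = low hashTableBits bits of c, matching the lookup in checkHash above.
void buildChainedHashTable(std::vector<HashHalves> hashes, uint32_t hashTableBits,
                           std::vector<uint32_t> &table, std::vector<uint64_t> &flat)
{
    uint32_t mask = (1u << hashTableBits) - 1;
    auto bucketOf = [&](const HashHalves &h) { return (uint32_t)h.bc & mask; };
    // Sort by bucket so each bucket occupies one contiguous run of entries.
    std::sort(hashes.begin(), hashes.end(),
              [&](const HashHalves &l, const HashHalves &r) { return bucketOf(l) < bucketOf(r); });
    // table[i] .. table[i+1] bound the entries in bucket i (prefix sums of the counts).
    table.assign((size_t(1) << hashTableBits) + 1, 0);
    for (const HashHalves &h : hashes)
        table[bucketOf(h) + 1]++;
    for (size_t i = 1; i < table.size(); i++)
        table[i] += table[i - 1];
    // Interleave the halves exactly as checkHash reads them:
    // flat[2*x] = ad (checked first), flat[2*x + 1] = bc (checked on a hit).
    flat.resize(hashes.size() * 2);
    for (size_t x = 0; x < hashes.size(); x++)
    {
        flat[2 * x]     = hashes[x].ad;
        flat[2 * x + 1] = hashes[x].bc;
    }
    // table would then be copied to d_chainedHashTable and flat to d_hashes.
}

The prefix-sum table is what lets checkHash find the start and end of a bucket with two global reads.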
blendaperry wrote: All of these steps assume the bitmap can live in shared memory. For a very large hash space, the best solution may be a kernel that eliminates the shared bitmap entirely.
Bitweasil wrote: Have 3 kernels for each hash type - FAST (for very small hash lists), MEDIUM (shared bitmap, no big global bitmap), and LARGE (no shared bitmap, large global bitmap) - and then benchmark before running to find out which option is fastest for the given parameters.
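A rough sketch (not from the actual Cryptohaze code) of what "benchmark before running" could look like on the host, using standard CUDA event timing; the enum and the three trial-launch functions are hypothetical placeholders for the FAST/MEDIUM/LARGE kernels described above.

#include <cuda_runtime.h>

enum KernelChoice { KERNEL_FAST, KERNEL_MEDIUM, KERNEL_LARGE };

// Hypothetical stand-ins: each would launch the corresponding kernel on a short trial workload.
void launchTrialFast()   { /* launch the FAST kernel here */ }
void launchTrialMedium() { /* launch the MEDIUM kernel here */ }
void launchTrialLarge()  { /* launch the LARGE kernel here */ }

// Time one trial launch with CUDA events (standard runtime API).
static float timeTrialMs(void (*launch)())
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    launch();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}

// Pick whichever variant ran fastest for the current hash list and card.
KernelChoice pickKernel()
{
    float tFast = timeTrialMs(launchTrialFast);
    float tMedium = timeTrialMs(launchTrialMedium);
    float tLarge = timeTrialMs(launchTrialLarge);
    if (tFast <= tMedium && tFast <= tLarge) return KERNEL_FAST;
    if (tMedium <= tLarge) return KERNEL_MEDIUM;
    return KERNEL_LARGE;
}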