Given a book of words. Assume you have enough main memory to accommodate all words. design a data structure to find top K maximum occurring words. The data structure should be dynamic so that new words can be added.
We can use Trie and Min Heap to get the k most frequent words efficiently. The idea is to use Trie for searching existing words adding new words efficiently. Trie also stores count of occurrences of words. A Min Heap of size k is used to keep track of k most frequent words at any point of time(Use of Min Heap is same as we used it to find k largest elements in this post).
Trie and Min Heap are linked with each other by storing an additional field in Trie ‘indexMinHeap’ and a pointer ‘trNode’ in Min Heap. The value of ‘indexMinHeap’ is maintained as -1 for the words which are currently not in Min Heap (or currently not among the top k frequent words). For the words which are present in Min Heap, ‘indexMinHeap’ contains, index of the word in Min Heap. The pointer ‘trNode’ in Min Heap points to the leaf node corresponding to the word in Trie.
Read full article from Find the k most frequent words from a file | GeeksforGeeks
We can use Trie and Min Heap to get the k most frequent words efficiently. The idea is to use Trie for searching existing words adding new words efficiently. Trie also stores count of occurrences of words. A Min Heap of size k is used to keep track of k most frequent words at any point of time(Use of Min Heap is same as we used it to find k largest elements in this post).
Trie and Min Heap are linked with each other by storing an additional field in Trie ‘indexMinHeap’ and a pointer ‘trNode’ in Min Heap. The value of ‘indexMinHeap’ is maintained as -1 for the words which are currently not in Min Heap (or currently not among the top k frequent words). For the words which are present in Min Heap, ‘indexMinHeap’ contains, index of the word in Min Heap. The pointer ‘trNode’ in Min Heap points to the leaf node corresponding to the word in Trie.
struct
TrieNode
{
bool
isEnd;
// indicates end of word
unsigned frequency;
// the number of occurrences of a word
int
indexMinHeap;
// the index of the word in minHeap
TrieNode* child[MAX_CHARS];
// represents 26 slots each for 'a' to 'z'.
};
// A Min Heap node
struct
MinHeapNode
{
TrieNode* root;
// indicates the leaf node of TRIE
unsigned frequency;
// number of occurrences
char
* word;
// the actual word stored
};
// A Min Heap
struct
MinHeap
{
unsigned capacity;
// the total size a min heap
int
count;
// indicates the number of slots filled.
MinHeapNode* array;
// represents the collection of minHeapNodes
};
void
printKMostFreq(
FILE
* fp,
int
k )
{
// Create a Min Heap of Size k
MinHeap* minHeap = createMinHeap( k );
// Create an empty Trie
TrieNode* root = NULL;
// A buffer to store one word at a time
char
buffer[MAX_WORD_SIZE];
// Read words one by one from file. Insert the word in Trie and Min Heap
while
(
fscanf
( fp,
"%s"
, buffer ) != EOF )
insertTrieAndHeap(buffer, &root, minHeap);
// The Min Heap will have the k most frequent words, so print Min Heap nodes
displayMinHeap( minHeap );
}
// Inserts a new word to both Trie and Heap
void
insertUtil ( TrieNode** root, MinHeap* minHeap,
const
char
* word,
const
char
* dupWord )
{
// Base Case
if
( *root == NULL )
*root = newTrieNode();
// There are still more characters in word
if
( *word !=
'\0'
)
insertUtil ( &((*root)->child[
tolower
( *word ) - 97 ]),
minHeap, word + 1, dupWord );
else
// The complete word is processed
{
// word is already present, increase the frequency
if
( (*root)->isEnd )
++( (*root)->frequency );
else
{
(*root)->isEnd = 1;
(*root)->frequency = 1;
}
// Insert in min heap also
insertInMinHeap( minHeap, root, dupWord );
}
}
// add a word to Trie & min heap. A wrapper over the insertUtil
void
insertTrieAndHeap(
const
char
*word, TrieNode** root, MinHeap* minHeap)
{
insertUtil( root, minHeap, word, word );
}
// Inserts a word to heap, the function handles the 3 cases explained above
void
insertInMinHeap( MinHeap* minHeap, TrieNode** root,
const
char
* word )
{
// Case 1: the word is already present in minHeap
if
( (*root)->indexMinHeap != -1 )
{
++( minHeap->array[ (*root)->indexMinHeap ]. frequency );
// percolate down
minHeapify( minHeap, (*root)->indexMinHeap );
}
// Case 2: Word is not present and heap is not full
else
if
( minHeap->count < minHeap->capacity )
{
int
count = minHeap->count;
minHeap->array[ count ]. frequency = (*root)->frequency;
minHeap->array[ count ]. word =
new
char
[
strlen
( word ) + 1];
strcpy
( minHeap->array[ count ]. word, word );
minHeap->array[ count ]. root = *root;
(*root)->indexMinHeap = minHeap->count;
++( minHeap->count );
buildMinHeap( minHeap );
}
// Case 3: Word is not present and heap is full. And frequency of word
// is more than root. The root is the least frequent word in heap,
// replace root with new word
else
if
( (*root)->frequency > minHeap->array[0]. frequency )
{
minHeap->array[ 0 ]. root->indexMinHeap = -1;
minHeap->array[ 0 ]. root = *root;
minHeap->array[ 0 ]. root->indexMinHeap = 0;
minHeap->array[ 0 ]. frequency = (*root)->frequency;
// delete previously allocated memoory and
delete
[] minHeap->array[ 0 ]. word;
minHeap->array[ 0 ]. word =
new
char
[
strlen
( word ) + 1];
strcpy
( minHeap->array[ 0 ]. word, word );
minHeapify ( minHeap, 0 );
}
}
No comments:
Post a Comment