http://blog.csdn.net/zzran/article/details/8439367
2012
这个算法很霸气啊,用了100k的单词,居然在15ms就解决了。
考虑了一下,还是决定把思路写出来吧。题目要求:给定数量很大的单词,比如说1000万个,找出最热门的前10个,也就是出现频数排名前十的单词。思路如下:
先统计出每个单词出现的次数,应用hash统计,这个方法很快。然后建立一个大小为10的小根堆,之后依次从文件中取出单词,并用单词出现的次数和小根堆堆顶元素的出现次数进行比较,如果大于堆顶元素出现的次数,则替换堆顶元素,然后调整小根堆。
#include<cstdio>
#include<cstdlib>
#include<cstring>
#include<iostream>
#include<string>
using namespace std;
// Number of buckets in the global hash table `bin`; hash() reduces every
// word to an index below this value.
const int HASHLEN = 2807303;
// Size in bytes of the fixed word buffers (word text plus terminating NUL).
const int WORDLEN = 30;
-
- typedef struct node_has_space{
- char word[WORDLEN];
- int count;
- struct node_has_space *next;
- }node_has_space, *p_node_has_space;
-
// Hash-chain node that refers to its word through a pointer instead of
// embedding a fixed-size buffer.
struct node_no_space {
    char *word;             // NUL-terminated word text, allocated separately
    int count;              // number of occurrences of `word`
    node_no_space *next;    // next node in the same bucket chain, or null
};
typedef node_no_space *p_node_no_space;
-
- p_node_no_space bin[HASHLEN] = {NULL};
-
// Exchanges the values of the two integers `a` and `b`.
void swap(int &a, int &b) {
    const int saved = a;
    a = b;
    b = saved;
}
- unsigned int hash(char *p_word) {
- unsigned int index = 0;
- while(*p_word) {
- index += index * 31 + *p_word;
- p_word++;
- }
- return index % HASHLEN;
- }
-
// Returns true for the characters trim_word() keeps at the edges of a word:
// ASCII digits and ASCII letters.
static bool is_word_char(char c) {
    return (c >= '0' && c <= '9') ||
           (c >= 'A' && c <= 'Z') ||
           (c >= 'a' && c <= 'z');
}

// Strips, in place, every leading and trailing character of `word` that is
// not an ASCII letter or digit. Characters in the middle of the word are kept.
//
// `word` must be a writable NUL-terminated string. If it contains no letter
// or digit at all, it becomes the empty string.
void trim_word(char *word) {
    const size_t len = strlen(word);

    // Find the kept range [begin, end). Both scans are bounded so they can
    // neither run below index 0 nor past the terminator.
    size_t end = len;
    while (end > 0 && !is_word_char(word[end - 1])) {
        --end;
    }
    size_t begin = 0;
    while (begin < end && !is_word_char(word[begin])) {
        ++begin;
    }

    const size_t kept = end - begin;
    if (begin > 0) {
        // Source and destination overlap, so memmove is required (strcpy on
        // overlapping buffers is undefined behavior).
        memmove(word, word + begin, kept);
    }
    word[kept] = '\0';
}
-
- void insert_word(char *word) {
- unsigned int index = hash(word);
- node_no_space *p = bin[index];
- while(p) {
- if(strcmp(p->word, word) == 0) {
- (p->count)++;
- return;
- }
- p = p->next;
- }
-
- p = (node_no_space*)malloc(sizeof(node_no_space));
- p->count = 1;
- p->word = (char*)malloc(strlen(word) + 1);
- strcpy(p->word, word);
- p->next = bin[index];
- bin[index] = p;
- }
-
- void write_to_file(char *file_path) {
- FILE* fout = fopen(file_path, "w");
- int i = 0;
- node_no_space *p;
- while(i < HASHLEN) {
- for(p = bin[i]; p != NULL; p = p->next) {
- fprintf(fout, "%s %d\n", p->word, p->count);
- }
- i++;
- }
- fclose(fout);
- }
-
- void min_heap(node_has_space heap[], int i, int len) {
- int left = i * 2;
- int right = i * 2 + 1;
- int min_index;
-
- if(left <= len && heap[left].count < heap[i].count) {
- min_index = left;
- } else {
- min_index = i;
- }
-
- if(right <= len && heap[right].count < heap[min_index].count) {
- min_index = right;
- }
- if(min_index != i) {
- swap(heap[i].count, heap[min_index].count);
- char buffer[WORDLEN];
- strcpy(buffer, heap[min_index].word);
- strcpy(heap[min_index].word, heap[i].word);
- strcpy(heap[i].word, buffer);
- min_heap(heap, min_index, len);
- }
- }
-
- void build_min_heap(node_has_space heap[], int n) {
- int index = n / 2;
- int i;
- for(i = index; i >= 1; i--) {
- min_heap(heap, i, n);
- }
- }
-
- void main() {
- int i;
- int _count;
- int n = 10;
- FILE *f_message, *fin;
- char *_word = (char*)malloc(WORDLEN);
- f_message = fopen("string.txt", "r");
- if(!f_message)
- return;
- while(fscanf(f_message, "%s", _word) != EOF) {
- if(strlen(_word)) {
- trim_word(_word);
- insert_word(_word);
- }
- }
- fclose(f_message);
-
- write_to_file("result.txt");
-
- fin = fopen("result.txt", "r");
- node_has_space *heap = (node_has_space*) malloc(sizeof(node_has_space) * (n + 1));
- for(i = 1; i <= n; i++) {
- fscanf(fin, "%s %d", _word, &_count);
- heap[i].count = _count;
- strcpy(heap[i].word, _word);
- }
- build_min_heap(heap, n);
- while(fscanf(fin,"%s %d", _word, &_count) != EOF) {
- if(_count > heap[1].count) {
- heap[1].count = _count;
- strcpy(heap[1].word, _word);
- min_heap(heap, 1, n);
- }
- }
-
- for(int k = 1; k <= n; k++) {
- cout << heap[k].word << ":" << heap[k].count << endl;
- }
- }
本站仅提供存储服务,所有内容均由用户发布,如发现有害或侵权内容,请
点击举报。