| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <getopt.h> |
| |
| #define PACKAGE "wgram" |
| #define VERSION "0.0.4" |
| #define MAXLINE 1024 |
| #define MAXGRAM 32 |
| |
/* print the usage summary, then exit the program with status `exval` */
| void print_help(int exval); |
| |
| int main (int argc, char *argv[]) { |
| /* word delimeter for strtok() */ |
| char delim[] = ".,:;`/\"+-_(){}[]<>*&^%$#@!?~/|\\=1234567890 \t\n"; |
| char line[MAXLINE]; /* input buff, fgets() */ |
| char *stray = NULL; /* returned value by strtok() */ |
| char **strarray = NULL; /* array to hold all entrys */ |
| int i = 0; /* general counter */ |
| int strcount = 0; /* number of entrys in pointer array */ |
| int N = 3, pos = 0; /* ngram size, 3 in this case */ |
| int opt = 0; /* holds command line opt nr.. */ |
| int word_flag = 0; /* print only the `raw' words */ |
| FILE *fp = stdin; /* read input from `FILE', default is stdin */ |
| |
| while((opt = getopt(argc, argv, "hvn:wf:")) != -1) { |
| switch(opt) { |
| case 'h': |
| print_help(0); |
| break; |
| case 'v': |
| exit(0); |
| break; |
| case 'n': |
| N = atoi(optarg); |
| if(N > MAXGRAM || N < 2) { |
| fprintf(stderr, "%s: Error - Ngram length `%d' out of range `0-%d'\n", |
| PACKAGE, N, MAXGRAM); |
| return 1; |
| } |
| break; |
| case 'w': |
| word_flag = 1; |
| break; |
| case 'f': |
| if(freopen(optarg, "r", fp) == NULL) { |
| fprintf(stderr, "%s: Error - opening `%s'\n", PACKAGE, optarg); |
| return 1; |
| } |
| break; |
| case '?': |
| fprintf(stderr, "%s: Error - No such option: `%c'\n\n", PACKAGE, optopt); |
| print_help(1); |
| } /* switch */ |
| } /* while */ |
| |
| /* start reading lines from file pointer, add all entrys to **strarray */ |
| while((fgets(line, MAXLINE, fp)) != NULL) { |
| if(strlen(line) < 2) |
| continue; |
| |
| stray = strtok(line, delim); |
| while(stray != NULL) { |
| strarray = (char **)realloc(strarray, (strcount + 1) * sizeof(char *)); |
| strarray[strcount++] = strdup(stray); |
| stray = strtok(NULL, delim); |
| } |
| } |
| |
| if(word_flag == 0) { |
| /* |
| // print the array of strings, jumping back each time |
| // (N - 1) positions if a whole ngram of words has been printed |
| */ |
| for(i = 0, pos = N; i < strcount; i++, pos--) { |
| if(pos == 0) pos = N, i -= (N - 1), printf("\n"); |
| printf("%s ", strarray[i]); |
| } |
| printf("\n"); |
| } else { |
| /* print raw words */ |
| for(i = 0; i < strcount; i++) |
| printf("%s\n", strarray[i]); |
| } |
| |
| /* free the string array */ |
| for(i = 0; i < strcount; i++) |
| free(strarray[i]); |
| |
| free(strarray); |
| return 0; |
| } |
| |
| /* status epilepticus .. print help */ |
| void print_help(int exval) { |
| printf("%s,%s extract N-grams from text data\n", PACKAGE, VERSION); |
| printf("Usage: %s [-h] [-v] [-n INT] [-w] [-f FILE]\n\n", PACKAGE); |
| |
| printf(" -h print this help and exit\n"); |
| printf(" -v print version and exit\n\n"); |
| |
| printf(" -n INT set ngram length (default=3)\n"); |
| printf(" -w print only the extracted words\n"); |
| printf(" -f FILE read input from `FILE' (default=stdin)\n\n"); |
| exit(exval); |
| } |