| /* |
| * Copyright 2001-2004 Brandon Long |
| * All Rights Reserved. |
| * |
| * ClearSilver Templating System |
| * |
| * This code is made available under the terms of the ClearSilver License. |
| * http://www.clearsilver.net/license.hdf |
| * |
| */ |
| |
| #include "cs_config.h" |
| |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/types.h> |
| #include <regex.h> |
| #include <ctype.h> |
| #include "util/neo_misc.h" |
| #include "util/neo_err.h" |
| #include "util/neo_str.h" |
| #include "html.h" |
| #include "cgi.h" |
| |
| static int has_space_formatting(const char *src, int slen) |
| { |
| int spaces = 0; |
| int returns = 0; |
| int ascii_art = 0; |
| int x = 0; |
| |
| for (x = 0; x < slen; x++) |
| { |
| if (src[x] == '\t') return 1; |
| if (src[x] == ' ') |
| { |
| spaces++; |
| if (x && (src[x-1] == '.')) |
| spaces--; |
| } |
| else if (src[x] == '\n') |
| { |
| spaces = 0; |
| returns++; |
| } |
| else if (strchr ("/\\<>:[]!@#$%^&*()|", src[x])) |
| { |
| ascii_art++; |
| if (ascii_art > 3) return 2; |
| } |
| else if (src[x] != '\r') |
| { |
| if (returns > 2) return 1; |
| if (spaces > 2) return 1; |
| returns = 0; |
| spaces = 0; |
| ascii_art = 0; |
| } |
| } |
| |
| return 0; |
| } |
| |
| /* |
| static int has_long_lines (char *s, int l) |
| { |
| char *ptr; |
| int x = 0; |
| |
| while (x < l) |
| { |
| ptr = strchr (s + x, '\n'); |
| if (ptr == NULL) |
| { |
| if (l - x > 75) return 1; |
| return 0; |
| } |
| if (ptr - (s + x) > 75) return 1; |
| x = ptr - s + 1; |
| } |
| return 0; |
| } |
| */ |
| |
| /* The first step is to actually find all of the URLs and email |
| * addresses using our handy regular expressions. We then mark these, |
| * and then go through convert non-special areas with straight |
| * text->html escapes, and convert special parts as special parts |
| */ |
| struct _parts { |
| int begin; |
| int end; |
| int type; |
| }; |
| |
| #define SC_TYPE_TEXT 1 |
| #define SC_TYPE_URL 2 |
| #define SC_TYPE_EMAIL 3 |
| |
| static char *EmailRe = "[^][@:;<>\\\"()[:space:][:cntrl:]]+@[-+a-zA-Z0-9]+\\.[-+a-zA-Z0-9\\.]+[-+a-zA-Z0-9]"; |
| static char *URLRe = "((http|https|ftp|mailto):(//)?[^[:space:]>\"\t]*|www\\.[-a-z0-9\\.]+)[^[:space:];\t\">]*"; |
| |
| static NEOERR *split_and_convert (const char *src, int slen, |
| STRING *out, HTML_CONVERT_OPTS *opts) |
| { |
| NEOERR *err = STATUS_OK; |
| static int compiled = 0; |
| static regex_t email_re, url_re; |
| regmatch_t email_match, url_match; |
| int errcode; |
| char *ptr, *esc; |
| char errbuf[256]; |
| struct _parts *parts; |
| int part_count; |
| int part; |
| int x, i; |
| int spaces = 0; |
| |
| if (!compiled) |
| { |
| if ((errcode = regcomp (&email_re, EmailRe, REG_ICASE | REG_EXTENDED))) |
| { |
| regerror (errcode, &email_re, errbuf, sizeof(errbuf)); |
| return nerr_raise (NERR_PARSE, "Unable to compile EmailRE: %s", errbuf); |
| } |
| if ((errcode = regcomp (&url_re, URLRe, REG_ICASE | REG_EXTENDED))) |
| { |
| regerror (errcode, &url_re, errbuf, sizeof(errbuf)); |
| return nerr_raise (NERR_PARSE, "Unable to compile URLRe: %s", errbuf); |
| } |
| compiled = 1; |
| } |
| |
| part_count = 20; |
| parts = (struct _parts *) malloc (sizeof(struct _parts) * part_count); |
| part = 0; |
| |
| x = 0; |
| if (regexec (&email_re, src+x, 1, &email_match, 0) != 0) |
| { |
| email_match.rm_so = -1; |
| email_match.rm_eo = -1; |
| } |
| else |
| { |
| email_match.rm_so += x; |
| email_match.rm_eo += x; |
| } |
| if (regexec (&url_re, src+x, 1, &url_match, 0) != 0) |
| { |
| url_match.rm_so = -1; |
| url_match.rm_eo = -1; |
| } |
| else |
| { |
| url_match.rm_so += x; |
| url_match.rm_eo += x; |
| } |
| while ((x < slen) && !((email_match.rm_so == -1) && (url_match.rm_so == -1))) |
| { |
| if (part >= part_count) |
| { |
| part_count *= 2; |
| parts = (struct _parts *) realloc (parts, sizeof(struct _parts) * part_count); |
| } |
| if ((url_match.rm_so != -1) && ((email_match.rm_so == -1) || (url_match.rm_so <= email_match.rm_so))) |
| { |
| parts[part].begin = url_match.rm_so; |
| parts[part].end = url_match.rm_eo; |
| parts[part].type = SC_TYPE_URL; |
| x = parts[part].end + 1; |
| part++; |
| if (x < slen) |
| { |
| if (regexec (&url_re, src+x, 1, &url_match, 0) != 0) |
| { |
| url_match.rm_so = -1; |
| url_match.rm_eo = -1; |
| } |
| else |
| { |
| url_match.rm_so += x; |
| url_match.rm_eo += x; |
| } |
| if ((email_match.rm_so != -1) && (x > email_match.rm_so)) |
| { |
| if (regexec (&email_re, src+x, 1, &email_match, 0) != 0) |
| { |
| email_match.rm_so = -1; |
| email_match.rm_eo = -1; |
| } |
| else |
| { |
| email_match.rm_so += x; |
| email_match.rm_eo += x; |
| } |
| } |
| } |
| } |
| else |
| { |
| parts[part].begin = email_match.rm_so; |
| parts[part].end = email_match.rm_eo; |
| parts[part].type = SC_TYPE_EMAIL; |
| x = parts[part].end + 1; |
| part++; |
| if (x < slen) |
| { |
| if (regexec (&email_re, src+x, 1, &email_match, 0) != 0) |
| { |
| email_match.rm_so = -1; |
| email_match.rm_eo = -1; |
| } |
| else |
| { |
| email_match.rm_so += x; |
| email_match.rm_eo += x; |
| } |
| if ((url_match.rm_so != -1) && (x > url_match.rm_so)) |
| { |
| if (regexec (&url_re, src+x, 1, &url_match, 0) != 0) |
| { |
| url_match.rm_so = -1; |
| url_match.rm_eo = -1; |
| } |
| else |
| { |
| url_match.rm_so += x; |
| url_match.rm_eo += x; |
| } |
| } |
| } |
| } |
| } |
| |
| i = 0; |
| x = 0; |
| while (x < slen) |
| { |
| if ((i >= part) || (x < parts[i].begin)) |
| { |
| ptr = strpbrk(src + x, "&<>\r\n "); |
| if (ptr == NULL) |
| { |
| if (spaces) |
| { |
| int sp; |
| for (sp = 0; sp < spaces - 1; sp++) |
| { |
| err = string_append (out, " "); |
| if (err != STATUS_OK) break; |
| } |
| if (err != STATUS_OK) break; |
| err = string_append_char (out, ' '); |
| } |
| spaces = 0; |
| if (i < part) |
| { |
| err = string_appendn (out, src + x, parts[i].begin - x); |
| x = parts[i].begin; |
| } |
| else |
| { |
| err = string_append (out, src + x); |
| x = slen; |
| } |
| } |
| else |
| { |
| if ((i >= part) || ((ptr - src) < parts[i].begin)) |
| { |
| if (spaces) |
| { |
| int sp; |
| for (sp = 0; sp < spaces - 1; sp++) |
| { |
| err = string_append (out, " "); |
| if (err != STATUS_OK) break; |
| } |
| if (err != STATUS_OK) break; |
| err = string_append_char (out, ' '); |
| } |
| spaces = 0; |
| err = string_appendn (out, src + x, (ptr - src) - x); |
| if (err != STATUS_OK) break; |
| x = ptr - src; |
| if (src[x] == ' ') |
| { |
| if (opts->space_convert) |
| { |
| spaces++; |
| } |
| else |
| err = string_append_char (out, ' '); |
| } |
| else |
| { |
| if (src[x] != '\n' && spaces) |
| { |
| int sp; |
| for (sp = 0; sp < spaces - 1; sp++) |
| { |
| err = string_append (out, " "); |
| if (err != STATUS_OK) break; |
| } |
| if (err != STATUS_OK) break; |
| err = string_append_char (out, ' '); |
| } |
| spaces = 0; |
| |
| if (src[x] == '&') |
| err = string_append (out, "&"); |
| else if (src[x] == '<') |
| err = string_append (out, "<"); |
| else if (src[x] == '>') |
| err = string_append (out, ">"); |
| else if (src[x] == '\n') |
| if (opts->newlines_convert) |
| err = string_append (out, "<br/>\n"); |
| else if (x && src[x-1] == '\n') |
| err = string_append (out, "<p/>\n"); |
| else |
| err = string_append_char (out, '\n'); |
| else if (src[x] != '\r') |
| err = nerr_raise (NERR_ASSERT, "src[x] == '%c'", src[x]); |
| } |
| x++; |
| } |
| else |
| { |
| if (spaces) |
| { |
| int sp; |
| for (sp = 0; sp < spaces - 1; sp++) |
| { |
| err = string_append (out, " "); |
| if (err != STATUS_OK) break; |
| } |
| if (err != STATUS_OK) break; |
| err = string_append_char (out, ' '); |
| } |
| spaces = 0; |
| err = string_appendn (out, src + x, parts[i].begin - x); |
| x = parts[i].begin; |
| } |
| } |
| } |
| else |
| { |
| if (spaces) |
| { |
| int sp; |
| for (sp = 0; sp < spaces - 1; sp++) |
| { |
| err = string_append (out, " "); |
| if (err != STATUS_OK) break; |
| } |
| if (err != STATUS_OK) break; |
| err = string_append_char (out, ' '); |
| } |
| spaces = 0; |
| if (parts[i].type == SC_TYPE_URL) |
| { |
| char last_char = src[parts[i].end-1]; |
| int suffix=0; |
| if (last_char == '.' || last_char == ',') { suffix=1; } |
| err = string_append (out, " <a "); |
| if (err != STATUS_OK) break; |
| if (opts->url_class) |
| { |
| err = string_appendf (out, "class=%s ", opts->url_class); |
| if (err) break; |
| } |
| if (opts->url_target) |
| { |
| err = string_appendf (out, "target=\"%s\" ", opts->url_target); |
| if (err) break; |
| } |
| err = string_append(out, "href=\""); |
| if (err) break; |
| if (opts->bounce_url) |
| { |
| char *url, *esc_url, *new_url; |
| int url_len; |
| if (!strncasecmp(src + x, "www.", 4)) |
| { |
| url_len = 7 + parts[i].end - x - suffix; |
| url = (char *) malloc(url_len+1); |
| if (url == NULL) |
| { |
| err = nerr_raise(NERR_NOMEM, |
| "Unable to allocate memory to convert url"); |
| break; |
| } |
| strcpy(url, "http://"); |
| strncat(url, src + x, parts[i].end - x - suffix); |
| } |
| else |
| { |
| url_len = parts[i].end - x - suffix; |
| url = (char *) malloc(url_len+1); |
| if (url == NULL) |
| { |
| err = nerr_raise(NERR_NOMEM, |
| "Unable to allocate memory to convert url"); |
| break; |
| } |
| strncpy(url, src + x, parts[i].end - x - suffix); |
| url[url_len] = '\0'; |
| } |
| err = cgi_url_escape(url, &esc_url); |
| free(url); |
| if (err) { |
| free(esc_url); |
| break; |
| } |
| |
| new_url = sprintf_alloc(opts->bounce_url, esc_url); |
| free(esc_url); |
| if (new_url == NULL) |
| { |
| err = nerr_raise(NERR_NOMEM, "Unable to allocate memory to convert url"); |
| break; |
| } |
| err = string_append (out, new_url); |
| free(new_url); |
| if (err) break; |
| } |
| else |
| { |
| if (!strncasecmp(src + x, "www.", 4)) |
| { |
| err = string_append (out, "http://"); |
| if (err != STATUS_OK) break; |
| } |
| err = string_appendn (out, src + x, parts[i].end - x - suffix); |
| if (err != STATUS_OK) break; |
| } |
| err = string_append (out, "\">"); |
| if (err != STATUS_OK) break; |
| if (opts->link_name) { |
| err = html_escape_alloc((opts->link_name), |
| strlen(opts->link_name), &esc); |
| } else { |
| err = html_escape_alloc((src + x), parts[i].end - x - suffix, &esc); |
| } |
| if (err != STATUS_OK) break; |
| err = string_append (out, esc); |
| free(esc); |
| if (err != STATUS_OK) break; |
| err = string_append (out, "</a>"); |
| if (suffix) { |
| err = string_appendn(out,src + parts[i].end - 1,1); |
| if (err != STATUS_OK) break; |
| } |
| } |
| else /* type == SC_TYPE_EMAIL */ |
| { |
| err = string_append (out, "<a "); |
| if (err != STATUS_OK) break; |
| if (opts->mailto_class) |
| { |
| err = string_appendf (out, "class=%s ", opts->mailto_class); |
| if (err) break; |
| } |
| err = string_append(out, "href=\"mailto:"); |
| if (err) break; |
| err = string_appendn (out, src + x, parts[i].end - x); |
| if (err != STATUS_OK) break; |
| err = string_append (out, "\">"); |
| if (err != STATUS_OK) break; |
| err = html_escape_alloc(src + x, parts[i].end - x, &esc); |
| if (err != STATUS_OK) break; |
| err = string_append (out, esc); |
| free(esc); |
| if (err != STATUS_OK) break; |
| err = string_append (out, "</a>"); |
| } |
| x = parts[i].end; |
| i++; |
| } |
| if (err != STATUS_OK) break; |
| } |
| free (parts); |
| return err; |
| } |
| |
| static void strip_white_space_end (STRING *str) |
| { |
| int x = 0; |
| int ol = str->len; |
| char *ptr; |
| int i; |
| |
| while (x < str->len) |
| { |
| ptr = strchr(str->buf + x, '\n'); |
| if (ptr == NULL) |
| { |
| /* just strip the white space at the end of the string */ |
| ol = strlen(str->buf); |
| while (ol && isspace(str->buf[ol-1])) |
| { |
| str->buf[ol - 1] = '\0'; |
| ol--; |
| } |
| str->len = ol; |
| return; |
| } |
| else |
| { |
| x = i = ptr - str->buf; |
| if (x) |
| { |
| x--; |
| while (x && isspace(str->buf[x]) && (str->buf[x] != '\n')) x--; |
| if (x) x++; |
| memmove (str->buf + x, ptr, ol - i + 1); |
| x++; |
| str->len -= ((i - x) + 1); |
| str->buf[str->len] = '\0'; |
| ol = str->len; |
| } |
| } |
| } |
| } |
| |
| NEOERR *convert_text_html_alloc (const char *src, int slen, |
| char **out) |
| { |
| return nerr_pass(convert_text_html_alloc_options(src, slen, out, NULL)); |
| } |
| |
| NEOERR *convert_text_html_alloc_options (const char *src, int slen, |
| char **out, |
| HTML_CONVERT_OPTS *opts) |
| { |
| NEOERR *err; |
| STRING out_s; |
| int formatting = 0; |
| HTML_CONVERT_OPTS my_opts; |
| |
| string_init(&out_s); |
| |
| if (opts == NULL) |
| { |
| opts = &my_opts; |
| opts->bounce_url = NULL; |
| opts->url_class = NULL; |
| opts->url_target = "_blank"; |
| opts->mailto_class = NULL; |
| opts->long_lines = 0; |
| opts->space_convert = 0; |
| opts->newlines_convert = 1; |
| opts->longline_width = 75; /* This hasn't been used in a while, actually */ |
| opts->check_ascii_art = 1; |
| opts->link_name = NULL; |
| } |
| |
| do |
| { |
| if (opts->check_ascii_art) |
| { |
| formatting = has_space_formatting (src, slen); |
| if (formatting) opts->space_convert = 1; |
| } |
| if (formatting == 2) |
| { |
| /* Do <pre> formatting */ |
| opts->newlines_convert = 1; |
| err = string_append (&out_s, "<tt>"); |
| if (err != STATUS_OK) break; |
| err = split_and_convert(src, slen, &out_s, opts); |
| if (err != STATUS_OK) break; |
| err = string_append (&out_s, "</tt>"); |
| if (err != STATUS_OK) break; |
| /* Strip white space at end of lines */ |
| strip_white_space_end (&out_s); |
| } |
| else |
| { |
| /* int nl = has_long_lines (src, slen); */ |
| err = split_and_convert(src, slen, &out_s, opts); |
| } |
| } while (0); |
| if (err != STATUS_OK) |
| { |
| string_clear (&out_s); |
| return nerr_pass (err); |
| } |
| if (out_s.buf == NULL) |
| { |
| *out = strdup(""); |
| } |
| else |
| { |
| *out = out_s.buf; |
| } |
| return STATUS_OK; |
| } |
| |
| NEOERR *html_escape_alloc (const char *src, int slen, |
| char **out) |
| { |
| return nerr_pass(neos_html_escape(src, slen, out)); |
| } |
| |
| /* Replace ampersand with iso-8859-1 character code */ |
| static unsigned char _expand_amp_8859_1_char (const char *s) |
| { |
| if (s[0] == '\0') |
| return 0; |
| |
| switch (s[0]) { |
| case '#': |
| if (s[1] == 'x') return strtol (s+2, NULL, 16); |
| return strtol (s+1, NULL, 10); |
| case 'a': |
| if (!strcmp(s, "agrave")) return 0xe0; /* à */ |
| if (!strcmp(s, "aacute")) return 0xe1; /* á */ |
| if (!strcmp(s, "acirc")) return 0xe2; /* â */ |
| if (!strcmp(s, "atilde")) return 0xe3; /* ã */ |
| if (!strcmp(s, "auml")) return 0xe4; /* ä */ |
| if (!strcmp(s, "aring")) return 0xe5; /* å */ |
| if (!strcmp(s, "aelig")) return 0xe6; /* æ */ |
| if (!strcmp(s, "amp")) return '&'; |
| return 0; |
| case 'c': |
| if (!strcmp(s, "ccedil")) return 0xe7; /* ç */ |
| return 0; |
| case 'e': |
| if (!strcmp(s, "egrave")) return 0xe8; /* è */ |
| if (!strcmp(s, "eacute")) return 0xe9; /* é */ |
| if (!strcmp(s, "ecirc")) return 0xea; /* ê */ |
| if (!strcmp(s, "euml")) return 0xeb; /* ë */ |
| if (!strcmp(s, "eth")) return 0xf0; /* ð */ |
| return 0; |
| case 'i': |
| if (!strcmp(s, "igrave")) return 0xec; /* ì */ |
| if (!strcmp(s, "iacute")) return 0xed; /* í */ |
| if (!strcmp(s, "icirc")) return 0xee; /* î */ |
| if (!strcmp(s, "iuml")) return 0xef; /* ï */ |
| return 0; |
| case 'g': |
| if (!strcmp(s, "gt")) return '>'; |
| return 0; |
| case 'l': |
| if (!strcmp(s, "lt")) return '<'; |
| return 0; |
| case 'n': |
| if (!strcmp(s, "ntilde")) return 0xf1; /* ñ */ |
| if (!strcmp(s, "nbsp")) return ' '; |
| return 0; |
| case 'o': |
| if (!strcmp(s, "ograve")) return 0xf2; /* ò */ |
| if (!strcmp(s, "oacute")) return 0xf3; /* ó */ |
| if (!strcmp(s, "ocirc")) return 0xf4; /* ô */ |
| if (!strcmp(s, "otilde")) return 0xf5; /* õ */ |
| if (!strcmp(s, "ouml")) return 0xf6; /* ö */ |
| if (!strcmp(s, "oslash")) return 0xf8; /* ø */ |
| return 0; |
| case 'q': /* quot */ |
| if (!strcmp(s, "quot")) return '"'; |
| return 0; |
| case 's': |
| if (!strcmp(s, "szlig")) return 0xdf; /* ß */ |
| return 0; |
| case 't': |
| if (!strcmp(s, "thorn")) return 0xfe; /* þ */ |
| return 0; |
| case 'u': |
| if (!strcmp(s, "ugrave")) return 0xf9; /* ù */ |
| if (!strcmp(s, "uacute")) return 0xfa; /* ú */ |
| if (!strcmp(s, "ucirc")) return 0xfb; /* û */ |
| if (!strcmp(s, "uuml")) return 0xfc; /* ü */ |
| return 0; |
| case 'y': |
| if (!strcmp(s, "yacute")) return 0xfd; /* ý */ |
| |
| } |
| return 0; |
| } |
| |
| char *html_expand_amp_8859_1(const char *amp, |
| char *buf) |
| { |
| unsigned char ch; |
| |
| ch = _expand_amp_8859_1_char(amp); |
| if (ch == '\0') |
| { |
| if (!strcmp(amp, "copy")) return "(C)"; |
| return ""; |
| } |
| else { |
| buf[0] = (char)ch; |
| buf[1] = '\0'; |
| return buf; |
| } |
| } |
| |
| NEOERR *html_strip_alloc(const char *src, int slen, |
| char **out) |
| { |
| NEOERR *err = STATUS_OK; |
| STRING out_s; |
| int x = 0; |
| int strip_match = -1; |
| int state = 0; |
| char amp[10]; |
| int amp_start = 0; |
| char buf[10]; |
| int ampl = 0; |
| |
| string_init(&out_s); |
| err = string_append (&out_s, ""); |
| if (err) return nerr_pass (err); |
| |
| while (x < slen) |
| { |
| switch (state) { |
| case 0: |
| /* Default */ |
| if (src[x] == '&') |
| { |
| state = 3; |
| ampl = 0; |
| amp_start = x; |
| } |
| else if (src[x] == '<') |
| { |
| state = 1; |
| } |
| else |
| { |
| if (strip_match == -1) |
| { |
| err = string_append_char(&out_s, src[x]); |
| if (err) break; |
| } |
| } |
| x++; |
| break; |
| case 1: |
| /* Starting TAG */ |
| if (src[x] == '>') |
| { |
| state = 0; |
| } |
| else if (src[x] == '/') |
| { |
| } |
| else |
| { |
| } |
| x++; |
| break; |
| case 2: |
| /* In TAG */ |
| if (src[x] == '>') |
| { |
| state = 0; |
| } |
| x++; |
| break; |
| case 3: |
| /* In AMP */ |
| if (src[x] == ';') |
| { |
| amp[ampl] = '\0'; |
| state = 0; |
| err = string_append(&out_s, html_expand_amp_8859_1(amp, buf)); |
| if (err) break; |
| } |
| else |
| { |
| if (ampl < sizeof(amp)-1) |
| amp[ampl++] = tolower(src[x]); |
| else |
| { |
| /* broken html... just back up */ |
| x = amp_start; |
| err = string_append_char(&out_s, src[x]); |
| if (err) break; |
| state = 0; |
| } |
| } |
| x++; |
| break; |
| } |
| if (err) break; |
| } |
| |
| |
| if (err) |
| { |
| string_clear (&out_s); |
| return nerr_pass (err); |
| } |
| *out = out_s.buf; |
| return STATUS_OK; |
| } |