#!patch
This patch adds an HTML mode to Ispell-3.1.20. In HTML mode, tags,
Javascript and HTML comments are skipped, and ampersand-encoded
entities can optionally be converted, as defined in
http://uts.cc.utexas.edu/~churchh/latin1.html
It was written in 1995 by Gerry Tierney, and extended with
ampersand-conversion by Casper Maarbjerg, 1997, distributed at:
http://www.nyx.net/~cmaarbj/ispell/
Installation:
Grab a fresh archive of Ispell-3.1.20 at:
ftp://ftp.cs.ucla.edu/pub/ispell-3.1
ftp://ftp.math.orst.edu/pub/ispell-3.1 or
ftp://ftp.nl.net/pub/textproc/ispell (Europe)
gzip -dc ispell-3.1.20.tar.Z | tar xvf -
cd ispell-3.1
gzip -dc ispell-iso-html.patch.gz | patch -lF3
Edit local.h for your preferences. When upgrading, the
current configuration can be retrieved with ispell -vv
If you want ampersand conversion, you need to add:
#define HTSPECIAL
#undef NO8BIT
Run make. If the dictionaries are not yet installed into the paths
defined in local.h, test with -d ./languages/american/americanmed+
(or whichever dictionary you use).
Casper Maarbjerg, May 7, 2000
http://www.nyx.net/~cmaarbj/
---+*+---
Index: correct.c
===================================================================
*** correct.c 1995/10/12 19:04:06 1.1.1.1
--- correct.c 1998/11/15 15:09:21 1.1.1.3
***************
*** 50,55 ****
--- 50,62 ----
/*
* $Log: correct.c,v $
+ *
+ * HTML-entities added by Casper Maarbjerg, 1997/05/16 as listed in
+ * http://uts.cc.utexas.edu/~churchh/latin1.html
+ *
+ * Line added by Gerry Tierney to reset insidehtml flag for each new
+ * file in case a tag was left open by a previous file. 10/14/95
+ *
* Revision 1.59 1995/08/05 23:19:43 geoff
* Fix a bug that caused offsets for long lines to be confused if the
* line started with a quoting uparrow.
***************
*** 233,238 ****
--- 240,248 ----
int bufsize;
int ch;
+ /* line added by Gerry Tierney */
+ insidehtml = 0;
+
for (bufno = 0; bufno < contextsize; bufno++)
contextbufs[bufno][0] = '\0';
***************
*** 295,301 ****
--- 305,315 ----
char * start_l2;
char * begintoken;
+ #ifdef HTSPECIAL
+ begintoken = ctok_start == NULL ? contextbufs[0] : ctok_start;
+ #else
begintoken = *curchar - strlen (ctok);
+ #endif
if (icharlen (itok) <= minword)
return; /* Accept very short words */
***************
*** 374,380 ****
--- 388,398 ----
if (start_l2 < contextbufs[0])
start_l2 = contextbufs[0];
}
+ #ifdef HTSPECIAL
+ show_line (start_l2, begintoken, *curchar - begintoken );
+ #else
show_line (start_l2, begintoken, (int) strlen (ctok));
+ #endif
if (minimenusize != 0)
{
***************
*** 594,599 ****
--- 612,627 ----
ichar = SET_SIZE + laststringch;
else
ichar = chartoichar (ch);
+ #ifdef HTSPECIAL
+ if (htmlflag == 1 && ch == '&' && !vflag && len == 1)
+ {
+ ch = html_ent(cp);
+ if (output)
+ (void) putchar (ch);
+ return 1;
+ }
+ else
+ #endif
if (!vflag && iswordch (ichar) && len == 1)
{
if (output)
***************
*** 1604,1609 ****
--- 1632,1642 ----
if (**cc == '\0')
break;
if (!aflag && !lflag)
+ #ifdef HTSPECIAL
+ if (htmlflag == 1 && (unsigned char)**cc >= FIRST_ISO) /* re-encode 8-bit char as its entity */
+ (void) fputs (iso_ent[(unsigned char) **cc - FIRST_ISO], outfile); /* fputs: table text is data, not a format */
+ else
+ #endif
(void) putc (**cc, outfile);
(*cc)++;
}
Index: defmt.c
===================================================================
*** defmt.c 1995/10/12 19:04:06 1.1.1.1
--- defmt.c 2000/05/07 19:25:36 1.3
***************
*** 54,59 ****
--- 54,66 ----
/*
* $Log: defmt.c,v $
+ *
+ * ISO-character de-/en- coding in html mode added 1997/05/16
+ * by Casper Maarbjerg, http://www.nyx.net/~cmaarbj/
+ *
+ * html-mode code added by Gerry Tierney
+ * 14th of Oct '95
+ *
* Revision 1.41 1995/08/05 23:19:47 geoff
* Get rid of an obsolete comment. Add recognition of documentclass and
* usepackage for Latex2e support.
***************
*** 140,145 ****
--- 147,153 ----
static void TeX_open_paren P ((char ** bufp));
static void TeX_skip_check P ((char ** bufp));
static int TeX_strncmp P ((char * a, char * b, int n));
+ char * htmlword P ((unsigned char *source));
#define ISTEXTERM(c) (((c) == TEXLEFTCURLY) || \
((c) == TEXRIGHTCURLY) || \
***************
*** 160,165 ****
--- 168,192 ----
static int save_math_mode;
static char save_LaTeX_Mode;
+ static char *skiptag(buf, tagend, taglen) /* Find the end of a closing tag */
+     char * buf;
+     char * tagend;
+     int taglen;
+ {
+     /* Look for tagend (taglen bytes, compared without regard to case).  On a
+      * hit, clear insidehtml and return the address just past the match. */
+     for ( ; *buf != '\0'; buf++)
+     {
+         if (*buf == *tagend && strncasecmp(buf, tagend, taglen) == 0)
+         {
+             insidehtml = 0;
+             return(buf + taglen);
+         }
+     }
+     return(buf); /* no match: buf rests on the terminating NUL */
+ }
+
+ /* parameters changed by Gerry Tierney to include the output file */
static char * skiptoword (bufp) /* Skip to beginning of a word */
char * bufp;
{
***************
*** 170,175 ****
--- 197,278 ----
|| (tflag && (math_mode & 1)))
)
{
+ /* Start of modifications by Gerry Tierney */
+ /* We first check for an end-quote character if we are checking
+ inside of an alt attribute. If we find one we ignore the
+ rest of the tag */
+ if (insidehtml == -1 && *bufp == '\"')
+ {
+ insidehtml = 0;
+ while (*bufp != '>' && *bufp != '\0')
+ bufp++;
+ if (*bufp == '\0')
+ insidehtml = 1;
+ }
+ /* If we are checking a html file we want to ignore any
+ HTML tags. These should start with a '<'
+ and end with a '>' so we simply skip over anything
+ between these two symbols. If we reach the end of the line
+ before finding a matching '>' we set a flag 'insidehtml' */
+ if (htmlflag == 1 && *bufp == '<')
+ {
+ /* Found start of html tag, if it is a script tag,
+ * skip until end of script */
+ if (insidehtml == 2 || (strncasecmp(bufp,"", 9) - currentchar;
+ copyout(&currentchar, tlen);
+ }
+ else if (insidehtml == 3) { /* filtering comments */
+ tlen = skiptag(currentchar, "-->", 3) - currentchar; /* distance to just past "-->", or to end of buffer */
+ copyout(&currentchar, tlen);
+ }
+ }
+ /* End of modifications by Gerry Tierney */
+
for ( ; ; )
{
p = skiptoword (currentchar);
+ #ifdef HTSPECIAL
+ ctok_start = p;
+ #endif
if (p != currentchar)
copyout (&currentchar, p - currentchar);
***************
*** 453,458 ****
--- 620,642 ----
p = ctoken;
endp = skipoverword (currentchar);
+ #ifdef HTSPECIAL
+ if (htmlflag == 1) /* We are honoring the ISO-HTML entities, */
+ { /* and have to convert to ISO before lookup */
+ while (currentchar < endp && p < ctoken + sizeof ctoken - 1)
+ {
+ if (*currentchar == '&')
+ {
+ *p++ = html_ent(&currentchar); /* html_ent advances currentchar itself */
+ if (currentchar > endp) /* never step beyond the end of the word */
+ currentchar = endp;
+ }
+ else
+ *p++ = *currentchar++;
+ }
+ }
+ else
+ #endif
while (currentchar < endp && p < ctoken + sizeof ctoken - 1)
*p++ = *currentchar++;
*p = 0;
***************
*** 545,550 ****
--- 729,739 ----
}
}
if (!aflag && !lflag)
+ #ifdef HTSPECIAL
+ if (htmlflag == 1) /* Translate into output file */
+ (void) fprintf (ofile, "%s", htmlword(ctoken));
+ else
+ #endif
(void) fprintf (ofile, "%s", ctoken);
}
***************
*** 899,901 ****
--- 1088,1265 ----
}
return cmpresult;
}
+
+
+ #ifdef HTSPECIAL
+
+ /*
+ * Code to convert from / to ISO HTML-entities.
+ *
+ * Decoding of alphabetic entities is performed by two table lookups,
+ * one for each of the first two characters after the `&'.
+ *
+ * The first lookup decides which string to use for the second lookup,
+ * and if both match, the corresponding position in the isochar array
+ * holds the character value.
+ *
+ * After the 8-bit value is determined, the input is verified against
+ * the iso_ent array, using strncmp(), and in case of mismatch the
+ * function returns the input character unconverted.
+ *
+ * The alternate numeric form of &#nnn; is also decoded by atoi, and
+ * checked for sanity, but will be converted to the name-form on output.
+ *
+ * Encoding is performed by htmlword on characters between FIRST_ISO and 255,
+ * and the iso_ent table must hold an entry for each.
+ */
+ static char *Y_key = "ACEINOTUYsaceinotuy"; /* Primary key: the character right after '&'; its index selects a row below */
+
+ static char *X_key[] = { /* Secondary key: second character of the name, one row per Y_key letter */
+ "gacturE", /* A */
+ "c", /* C */
+ "gacuT", /* E */
+ "gacu", /* I */
+ "t", /* N */
+ "gactus", /* O */
+ "h", /* T */
+ "gacu", /* U */
+ "a", /* Y */
+ "z", /* s */
+ "gacture", /* a */
+ "c", /* c */
+ "gacut", /* e */
+ "gacu", /* i */
+ "t", /* n */
+ "gactus", /* o */
+ "h", /* t */
+ "gacu", /* u */
+ "au" /* y */
+ };
+
+ static unsigned char *isochar[] = { /* 8-bit values of above table; row and column positions parallel X_key */
+ "\300\301\302\303\304\305\306", /* A */
+ "\307", /* C */
+ "\310\311\312\313\320", /* E */
+ "\314\315\316\317", /* I */
+ "\321", /* N */
+ "\322\323\324\325\326\330", /* O */
+ "\336", /* T */
+ "\331\332\333\334", /* U */
+ "\335", /* Y */
+ "\337", /* s */
+ "\340\341\342\343\344\345\346", /* a */
+ "\347", /* c */
+ "\350\351\352\353\360", /* e */
+ "\354\355\356\357", /* i */
+ "\361", /* n */
+ "\362\363\364\365\366\370", /* o */
+ "\376", /* t */
+ "\371\372\373\374", /* u */
+ "\375\377", /* y */
+ };
+
+ /* Entity names for the 8-bit codes FIRST_ISO (160) through 255, indexed by
+  * (code - FIRST_ISO).  Reference: http://uts.cc.utexas.edu/~churchh/latin1.html
+  */
+ char *iso_ent[] = { /* Valid HTML characters 160..255 in numerical order */
+     "&nbsp;", "&iexcl;", "&cent;", "&pound;", "&curren;", "&yen;",
+     "&brvbar;", "&sect;", "&uml;", "&copy;", "&ordf;", "&laquo;",
+     "&not;", "&shy;", "&reg;", "&macr;", "&deg;", "&plusmn;",
+     "&sup2;", "&sup3;", "&acute;", "&micro;", "&para;", "&middot;",
+     "&cedil;", "&sup1;", "&ordm;", "&raquo;", "&frac14;", "&frac12;",
+     "&frac34;", "&iquest;",
+     "&Agrave;", "&Aacute;", "&Acirc;", "&Atilde;", "&Auml;", "&Aring;", "&AElig;",
+     "&Ccedil;",
+     "&Egrave;", "&Eacute;", "&Ecirc;", "&Euml;",
+     "&Igrave;", "&Iacute;", "&Icirc;", "&Iuml;",
+     "&ETH;", "&Ntilde;",
+     "&Ograve;", "&Oacute;", "&Ocirc;", "&Otilde;", "&Ouml;", "&times;", "&Oslash;",
+     "&Ugrave;", "&Uacute;", "&Ucirc;", "&Uuml;",
+     "&Yacute;", "&THORN;", "&szlig;", /* NOTE(review): X_key row for 'T' holds 'h', so "&THORN;" is never decoded */
+     "&agrave;", "&aacute;", "&acirc;", "&atilde;", "&auml;", "&aring;", "&aelig;",
+     "&ccedil;",
+     "&egrave;", "&eacute;", "&ecirc;", "&euml;",
+     "&igrave;", "&iacute;", "&icirc;", "&iuml;",
+     "&eth;", "&ntilde;",
+     "&ograve;", "&oacute;", "&ocirc;", "&otilde;", "&ouml;", "&divide;", "&oslash;",
+     "&ugrave;", "&uacute;", "&ucirc;", "&uuml;",
+     "&yacute;", "&thorn;", "&yuml;" /* code 255 is lower-case y with diaeresis */
+ };
+
+ /* Step *entity over one of the markup entities "&lt;", "&gt;", "&amp;",
+  * "&quot;" or "&nbsp;"; returns the distance advanced, or 0 if none matched. */
+ int skip_ent(char **entity)
+ {
+     /* The advance is the entity length minus one, so *entity is left on the
+      * closing ';' -- presumably the caller steps over that itself (confirm). */
+     int match = 0;
+     if (strncmp(*entity, "&lt;", 4) == 0 || strncmp(*entity, "&gt;", 4) == 0)
+         match = 3;
+     else if (strncmp(*entity, "&amp;", 5) == 0)
+         match = 4;
+     else if (strncmp(*entity, "&quot;", 6) == 0)
+         match = 5;
+     else if (strncmp(*entity, "&nbsp;", 6) == 0)
+         match = 5;
+     if (match)
+         *entity += match;
+     return(match);
+ }
+
+ /* Return 8-bit value of valid html-entity pointed to by *in, incrementing
+ * the pointer by the length of the tag.
+ * Only the first two alpha characters after '&' are tested, then the
+ * decoded char is verified against the iso_ent array.
+ */
+ int html_ent(char **in)
+ {
+ char *cp, *cp2, ch;
+ char *decoded;
+ int row, val, taglen = 1;
+
+ cp = cp2 = *in;
+ val = ch = **in;
+
+ if (*++cp && *cp == '#' && (*++cp == '1' || *cp == '2') && (val = atoi(cp)))
+ {
+ if (val > 255 || *++cp < '0' || *cp > '5' || *++cp < '0' || *cp > '9' || *++cp!=';')
+ val = 0;
+ else
+ taglen = 6; /* Validate numeric tag */
+ }
+ else
+ {
+ if ((cp = index(Y_key, *++cp2)) &&
+ (decoded = index(*(X_key+(row = cp-Y_key)), *++cp2)) &&
+ (val = isochar[row] [ decoded - X_key[row]]) >= FIRST_ISO)
+ taglen = strlen(iso_ent[val - FIRST_ISO]);
+ if (val to
+ * allow checking of html code. Adds -h switch and checking for
+ * html files by .html or .htm extension.
+ * 14th of October 1995
+ *
* Revision 1.133 1995/10/11 04:30:29 geoff
* Get rid of an unused variable.
*
***************
*** 298,304 ****
* ABCDEFGHIJKLMNOPQRSTUVWXYZ 0123456789
* ^^^^ ^^^ ^ ^^ ^^
* abcdefghijklmnopqrstuvwxyz
! * ^^^^^^ ^^^ ^ ^^ ^^^
*/
arglen = strlen (*argv);
switch ((*argv)[1])
--- 304,312 ----
* ABCDEFGHIJKLMNOPQRSTUVWXYZ 0123456789
* ^^^^ ^^^ ^ ^^ ^^
* abcdefghijklmnopqrstuvwxyz
! * ^^^^^^ ^ ^^^ ^ ^^ ^^^
! *
! * -h flag used by Gerry Tierney for html-mode
*/
arglen = strlen (*argv);
switch ((*argv)[1])
***************
*** 438,443 ****
--- 446,454 ----
(void) printf ("\tNO8BIT\n");
#else /* NO8BIT */
(void) printf ("\t!NO8BIT (8BIT)\n");
+ #ifdef HTSPECIAL
+ (void) printf ("\tHTSPECIAL \"(ISO-HTML mode)\"\n");
+ #endif
#endif /* NO8BIT */
(void) printf ("\tNRSPECIAL = \"%s\"\n", NRSPECIAL);
(void) printf ("\tOLDPAFF = \"%s\"\n", OLDPAFF);
***************
*** 488,493 ****
--- 499,505 ----
if (arglen > 2)
usage ();
tflag = 0; /* nroff/troff mode */
+ htmlflag = -1; /* non-html mode */
deftflag = 0;
if (preftype == NULL)
preftype = "nroff";
***************
*** 496,505 ****
--- 508,526 ----
if (arglen > 2)
usage ();
tflag = 1;
+ htmlflag = -1; /* non-html mode */
deftflag = 1;
if (preftype == NULL)
preftype = "tex";
break;
+ /* -h option to enable HTML-mode added by Gerry Tierney */
+ case 'h':
+ if (arglen > 2)
+ usage ();
+ tflag = 0; /* non-TeX mode */
+ deftflag = 0;
+ htmlflag = 1; /* Html-Mode */
+ break;
case 'T': /* Set preferred file type */
p = (*argv)+2;
if (*p == '\0')
***************
*** 810,816 ****
if (tflag < 0)
tflag =
(cp = rindex (filename, '.')) != NULL && strcmp (cp, ".tex") == 0;
!
if (prefstringchar < 0)
{
defdupchar =
--- 831,837 ----
if (tflag < 0)
tflag =
(cp = rindex (filename, '.')) != NULL && strcmp (cp, ".tex") == 0;
!
if (prefstringchar < 0)
{
defdupchar =
***************
*** 818,823 ****
--- 839,851 ----
if (defdupchar < 0)
defdupchar = 0;
}
+ /* Modification by Gerry Tierney to set html-mode
+ * based on file extension */
+ if (htmlflag == 0)
+ htmlflag =
+ (cp = rindex (filename, '.')) != NULL &&
+ ( strcmp (cp, ".html") == 0 ||
+ strcmp (cp, ".htm") == 0);
if ((infile = fopen (filename, "r")) == NULL)
{
Index: ispell.h
===================================================================
*** ispell.h 1995/10/12 19:04:08 1.1.1.1
--- ispell.h 1998/11/15 15:09:21 1.1.1.3
***************
*** 42,47 ****
--- 42,57 ----
/*
* $Log: ispell.h,v $
+ *
+ * Patch by Casper Maarbjerg, http://www.nyx.net/~cmaarbj/
+ * 1997/05/19, for ISO HTML-entity conversion in html mode.
+ * Added variable ctok_start to hold the start of raw html word.
+ * changes wrapped in "#ifdef HTSPECIAL".
+ *
+ * Patch by Gerry Tierney
+ * 1995/10/14
+ * Added variables htmlflag and insidehtml for use in html-mode
+ *
* Revision 1.68 1995/03/06 02:42:41 geoff
* Be vastly more paranoid about parenthesizing macro arguments. This
* fixes a bug in defmt.c where a complex argument was passed to
***************
*** 623,628 ****
--- 633,658 ----
INIT (int deftflag, -1); /* NZ for TeX mode by default */
INIT (int tflag, DEFTEXFLAG); /* NZ for TeX mode in current file */
INIT (int prefstringchar, -1); /* Preferred string character type */
+ /* The following two definitions added by
+ * Gerry Tierney
+ * 14th Oct 95
+ */
+ INIT (int htmlflag, 0); /* HTML-checking state.
+ * 1=enable html-mode,
+ * 0=enable html-mode based on filename,
+ * -1=disable html-mode */
+ INIT (int insidehtml, 0); /* Flag to indicate that the current html
+ * tag has spanned more than one line */
+ /* End of Gerry's Interference */
+ #ifdef HTSPECIAL /* decode "" for HTML-ISO characters */
+ #ifdef NO8BIT
+ #error HTSPECIAL requires NO8BIT to be undefined !
+ #endif
+ #define FIRST_ISO 160 /* First 8-bit code of valid HTML entities */
+ extern char *iso_ent[]; /* HTML entities defined in defmt.c */
+ extern int html_ent P ((char **in));
+ INIT (char *ctok_start, NULL); /* Remember start of raw HTML word */
+ #endif
INIT (int terse, 0); /* NZ for "terse" mode */