/ [swis] / trunk / src / html-strip.l
To checkout: svn checkout http://svn.gnu.org.ua/sources/swis/trunk/src/html-strip.l
Puszcza

Contents of /trunk/src/html-strip.l

Parent Directory | Revision Log


Revision 13 - (show annotations)
Thu Oct 4 12:10:46 2007 UTC (13 years, 8 months ago) by gray
File size: 8114 byte(s)
src/version.c, src/readname.c: New files
src/word-split.c, src/html-strip.l: Add support for --from-file
(-T) option. 
Makefile.am (libswis.a): New goal
swis.h: Add new prototypes
gnulib.modules: Add obstack

1 %{
2 /* This file is part of SWIS
3 Copyright (C) 2007 Sergey Poznyakoff
4
5 SWIS is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 SWIS is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with SWIS. If not, see <http://www.gnu.org/licenses/>. */
17
18 #include "swis.h"
19 #include <iconv.h>
20
/* Bytes of the current document that still await charset conversion.
   The catch-all scanner rule appends one byte at a time; an incomplete
   multibyte sequence stays here until the rest of it arrives.  */
char inbuf[80];
/* Scratch buffer receiving the output of a single iconv call.  */
char outbuf[80];
/* Number of pending bytes currently held in inbuf.  */
size_t idx;
/* Value iconv_open returns on failure; also used to mean "no conversion
   is active".  Parenthesized so the macro is safe in any expression.  */
#define INVALID_ICONV_CD ((iconv_t) -1)
/* Active conversion descriptor, or INVALID_ICONV_CD when input is
   passed through unchanged.  */
iconv_t cd = INVALID_ICONV_CD;
26
27 static void
28 parse_content_type ()
29 {
30 char *start, *p;
31 size_t len;
32 char *buf;
33
34 for (len = 1, start = yytext + yyleng - 2;
35 start > yytext && start[-1] != '"'; start--, len++)
36 ;
37
38 len = strlen (start) - 1;
39 buf = xmalloc (len + 1);
40 memcpy (buf, start, len);
41 buf[len] = 0;
42 p = strchr (buf, ';');
43 if (p)
44 {
45 for (p++; *p && isspace (*p); p++)
46 ;
47 if (strncasecmp (p, "charset=", 8) == 0)
48 {
49 p += 8;
50 start = p;
51 while (*p)
52 {
53 if (*p == ';' || isspace (*p))
54 {
55 *p = 0;
56 break;
57 }
58 p++;
59 }
60 memmove (buf, start, p - start + 1);
61 if (strcasecmp (buf, "utf-8"))
62 {
63 if (yy_flex_debug)
64 fprintf (stderr, "enabling conversion %s->%s\n", buf,"utf-8");
65 cd = iconv_open ("UTF-8", buf);
66 if (cd == INVALID_ICONV_CD)
67 error (0, errno, "cannot convert from %s", buf);
68 }
69 }
70 }
71 free (buf);
72 }
73
74 int in_body;
75
76 void
77 output (const char *str)
78 {
79 if (in_body)
80 fputs (str, yyout);
81 }
82
83
84 void
85 convert_output ()
86 {
87 char *outptr = outbuf;
88 size_t outsize = sizeof (outbuf);
89 char *inptr = inbuf;
90 size_t insize;
91 size_t rc;
92
93 insize = idx;
94 rc = iconv (cd, (ICONV_CONST char**)&inptr, &insize, &outptr, &outsize);
95 if (outptr != outbuf)
96 {
97 int saved_errno = errno;
98 if (fwrite (outbuf, 1, outptr - outbuf, yyout) < outptr - outbuf)
99 error (1, errno, "write error");
100 errno = saved_errno;
101 }
102 if (rc == (size_t) -1)
103 {
104 if (errno == EILSEQ)
105 error (1, 0, "cannot convert \"%.*s\"", idx, inbuf);
106 else if (errno == EINVAL)
107 {
108 memmove (inbuf, inptr, insize);
109 idx = insize;
110 }
111 else if (errno != E2BIG)
112 error (1, errno, "cannot convert");
113 }
114 else
115 idx = 0;
116 }
117
118
119 %}
120
121 %option case-insensitive 8bit
122
123 %x ELEMENT
124
125 WS [ \t]+
126 %%
<INITIAL>{
  "<meta"{WS}"http-equiv=\"Content-Type\""{WS}"content=\""[^\"]*"\"" {
      /* The document declares its content type: pick up the charset,
         then skip the rest of the tag.  */
      parse_content_type ();
      BEGIN (ELEMENT);
  }
  "<body"    { in_body = 1; BEGIN (ELEMENT); }
  "</body"   { in_body = 0; BEGIN (ELEMENT); }
  "<"        { BEGIN (ELEMENT); }
  "&middot;" {
      /* Named entities are emitted directly as UTF-8.  The scanner is
         case-insensitive, so each pattern matches both spellings; where
         upper and lower case differ, the first letter of the matched
         text selects the form.  */
      output ("·");
  }
  "&Agrave;" output (yytext[1] == 'A' ? "À" : "à");
  "&Aacute;" output (yytext[1] == 'A' ? "Á" : "á");
  "&Acirc;"  output (yytext[1] == 'A' ? "Â" : "â");
  "&Atilde;" output (yytext[1] == 'A' ? "Ã" : "ã");
  "&Auml;"   output (yytext[1] == 'A' ? "Ä" : "ä");
  "&Aring;"  output (yytext[1] == 'A' ? "Å" : "å");
  "&AElig;"  output (yytext[1] == 'A' ? "Æ" : "æ");
  "&Ccedil;" output (yytext[1] == 'C' ? "Ç" : "ç");
  "&Egrave;" output (yytext[1] == 'E' ? "È" : "è");
  "&Eacute;" output (yytext[1] == 'E' ? "É" : "é");
  "&Ecirc;"  output (yytext[1] == 'E' ? "Ê" : "ê");
  "&Euml;"   output (yytext[1] == 'E' ? "Ë" : "ë");
  "&Igrave;" output (yytext[1] == 'I' ? "Ì" : "ì");
  "&Iacute;" output (yytext[1] == 'I' ? "Í" : "í");
  "&Icirc;"  output (yytext[1] == 'I' ? "Î" : "î");
  "&Iuml;"   output (yytext[1] == 'I' ? "Ï" : "ï");
  "&Dstrok;" output ("Ð");
  "&Ntilde;" output (yytext[1] == 'N' ? "Ñ" : "ñ");
  "&Ograve;" output (yytext[1] == 'O' ? "Ò" : "ò");
  "&Oacute;" output (yytext[1] == 'O' ? "Ó" : "ó");
  "&Ocirc;"  output (yytext[1] == 'O' ? "Ô" : "ô");
  "&Otilde;" output (yytext[1] == 'O' ? "Õ" : "õ");
  "&Ouml;"   output (yytext[1] == 'O' ? "Ö" : "ö");
  "&Oslash;" output (yytext[1] == 'O' ? "Ø" : "ø");
  "&Ugrave;" output (yytext[1] == 'U' ? "Ù" : "ù");
  "&Uacute;" output (yytext[1] == 'U' ? "Ú" : "ú");
  "&Ucirc;"  output (yytext[1] == 'U' ? "Û" : "û");
  "&Uuml;"   output (yytext[1] == 'U' ? "Ü" : "ü");
  "&Yacute;" output (yytext[1] == 'Y' ? "Ý" : "ý");
  "&THORN;"  output (yytext[1] == 'T' ? "Þ" : "þ");
  "&szlig;"  output ("ß");
  "&Eth;"    output (yytext[1] == 'E' ? "Ð" : "ð");
  "&yuml;"   output ("ÿ");
  "&#"[0-9]{1,5}";" {
      /* Decimal character reference: emit the code point as UTF-8,
         like the named entities above.  NUL, surrogates and the
         no-break space (treated like &nbsp;) become a plain space.  */
      unsigned long cp = 0;
      const char *digit;
      char u8[5];
      size_t n = 0;

      for (digit = yytext + 2; *digit != ';'; digit++)
        cp = cp * 10 + (unsigned long) (*digit - '0');

      if (cp == 0 || cp == 0xA0 || (cp >= 0xD800 && cp <= 0xDFFF))
        cp = ' ';

      if (cp < 0x80)
        u8[n++] = (char) cp;
      else if (cp < 0x800)
        {
          u8[n++] = (char) (0xC0 | (cp >> 6));
          u8[n++] = (char) (0x80 | (cp & 0x3F));
        }
      else if (cp < 0x10000)
        {
          u8[n++] = (char) (0xE0 | (cp >> 12));
          u8[n++] = (char) (0x80 | ((cp >> 6) & 0x3F));
          u8[n++] = (char) (0x80 | (cp & 0x3F));
        }
      else
        {
          u8[n++] = (char) (0xF0 | (cp >> 18));
          u8[n++] = (char) (0x80 | ((cp >> 12) & 0x3F));
          u8[n++] = (char) (0x80 | ((cp >> 6) & 0x3F));
          u8[n++] = (char) (0x80 | (cp & 0x3F));
        }
      u8[n] = 0;
      output (u8);
  }
  "&nbsp;"   output (" ");
  "&"[#a-z0-9]+";" {
      /* Any other entity becomes a space.  Only entity-like names are
         accepted here, so a bare '&' in running text no longer swallows
         everything up to the next ';'.  */
      output (" ");
  }
  . {
      /* Ordinary text is copied only inside the body, through iconv
         when a conversion is active.  */
      if (in_body)
        {
          if (cd != INVALID_ICONV_CD)
            {
              inbuf[idx++] = yytext[0];
              convert_output ();
            }
          else
            ECHO;
        }
  }
}
<ELEMENT>{
  ["][^"]*["]   { /* Double-quoted attribute value: discard it whole, so
                     that a '>' inside it does not end the tag.  */ }
  ['][^']*[']   { /* Single-quoted attribute value: likewise.  */ }
  ">"           {
      /* End of the tag: a space stands in for it in the output, and
         scanning of ordinary text resumes.  */
      output (" ");
      BEGIN (INITIAL);
  }
  .             { /* Any other character of the tag is dropped.  */ }
}
191 %%
192
193 char **input_file;
194 int tag_option;
195
196 int
197 open_input ()
198 {
199 if (input_file && *input_file)
200 {
201 char *name = *input_file++;
202 if (name[0] == '-' && name[1] == 0)
203 yyin = stdin;
204 else
205 {
206 yyin = fopen (name, "r");
207 if (!yyin)
208 error (1, errno, "cannot open input file %s", name);
209 }
210 if (tag_option)
211 {
212 fprintf (yyout ? yyout : stdout, "\n> %s\n", name);
213 }
214 return 0;
215 }
216 return 1;
217 }
218
219
220 int
221 yywrap()
222 {
223 if (cd != INVALID_ICONV_CD)
224 {
225 if (idx)
226 convert_output ();
227 iconv_close (cd);
228 cd = INVALID_ICONV_CD;
229 }
230 return open_input ();
231 }
232
/* Codes for long options that have no single-letter equivalent.  */
enum {
  PROGNAME_OPTION,
};

/* Long options understood by main ().  Each entry maps to the code
   handled in main's option switch.  */
struct option options[] = {
  { "progname", required_argument, NULL, PROGNAME_OPTION },
  { "debug", no_argument, NULL, 'd' },
  { "help", no_argument, NULL, 'h' },
  { "version", no_argument, NULL, 'v' },
  { "output", required_argument, NULL, 'o' },
  { "tag", no_argument, NULL, 't' },
  { "files-from", required_argument, NULL, 'T' },
  /* Alias: the help text has advertised -T under this name.  */
  { "from-file", required_argument, NULL, 'T' },
  { "null", no_argument, NULL, '0' },
  { NULL, 0, NULL, 0 }
};
248
249 void
250 usage ()
251 {
252 printf ("Usage: html-strip [OPTIONS] [FILES...]\n");
253 printf ("Strip off HTML tags from input files and convert them to UTF-8\n");
254 printf ("\nOptions are:\n");
255 printf (" -d, --debug output debugging info\n");
256 printf (" -o, --output=FILE direct output to FILE instead of stdout\n");
257 printf (" -t, --tag tag each output block with the source file name");
258 printf (" -T, --from-file=FILE read input file names from FILE\n");
259 printf (" -0, --null -T reads null-terminated names\n");
260 printf ("\n");
261 printf (" -h, --help print this help list\n");
262 printf (" -v, --version print program version and exit\n");
263 printf ("\n");
264 printf ("Report bugs to <%s>\n", PACKAGE_BUGREPORT);
265 }
266
267 int
268 main (int argc, char **argv)
269 {
270 int c;
271
272 program_name = argv[0];
273 yy_flex_debug = 0;
274
275 while ((c = getopt_long (argc, argv, "0dhoT::tv", options, NULL)) != EOF)
276 {
277 switch (c)
278 {
279 case '0':
280 filename_terminator = 0;
281 break;
282
283 case 'd':
284 yy_flex_debug = 1;
285 break;
286
287 case PROGNAME_OPTION:
288 program_name = optarg;
289 break;
290
291 case 'h':
292 usage ();
293 exit (0);
294
295 case 'o':
296 yyout = fopen (optarg, "w");
297 if (!yyout)
298 error (1, errno, "cannot open output file %s", optarg);
299 break;
300
301 case 'T':
302 read_names_from_file (optarg);
303 break;
304
305 case 't':
306 tag_option = 1;
307 break;
308
309 case 'v':
310 swis_version (stdout, "html-strip");
311 exit (0);
312
313 default:
314 exit (1);
315 }
316 }
317
318 argc -= optind;
319 argv += optind;
320
321 update_argcv (&argc, &argv);
322
323 if (argc)
324 {
325 input_file = argv;
326 open_input ();
327 }
328
329 while (yylex ())
330 ;
331 exit (0);
332 }
333
334
335 /* Local Variables: */
336 /* mode: c */
337 /* buffer-file-coding-system: utf-8 */
338 /* End: */

Send suggestions and bug reports to Sergey Poznyakoff
ViewVC Help
Powered by ViewVC 1.1.20