/ [swis] / trunk / src / word-split.c
To checkout: svn checkout http://svn.gnu.org.ua/sources/swis/trunk/src/word-split.c
Puszcza

Contents of /trunk/src/word-split.c

Parent Directory | Revision Log


Revision 13 - (show annotations)
Thu Oct 4 12:10:46 2007 UTC (13 years, 8 months ago) by gray
File MIME type: text/plain
File size: 4584 byte(s)
src/version.c, src/readname.c: New files
src/word-split.c, src/html-strip.l: Add support for --from-file
(-T) option. 
Makefile.am (libswis.a): New goal
swis.h: Add new prototypes
gnulib.modules: Add obstack

1 /* This file is part of SWIS
2 Copyright (C) 2007 Sergey Poznyakoff
3
4 SWIS is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 SWIS is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with SWIS. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include "swis.h"
18
/* Codes returned by getopt_long for options that have no short form.
   They start above the range of unsigned char so that they can never
   collide with a short option letter.  */
enum {
  PROGNAME_OPTION = 256,
};

/* Long option table passed to getopt_long in main.

   "from-file" is the spelling shown in the help output; "files-from"
   is kept as an alias so that existing invocations keep working.
   "tag" is the long form of -t, which the help output also lists.  */
struct option options[] = {
  { "progname", required_argument, NULL, PROGNAME_OPTION },
  { "debug", no_argument, NULL, 'd' },
  { "help", no_argument, NULL, 'h' },
  { "version", no_argument, NULL, 'v' },
  { "output", required_argument, NULL, 'o' },
  { "from-file", required_argument, NULL, 'T' },
  { "files-from", required_argument, NULL, 'T' },
  { "tag", no_argument, NULL, 't' },
  { "null", no_argument, NULL, '0' },
  { NULL, 0, NULL, 0 }
};
33
34 void
35 usage ()
36 {
37 printf ("Usage: word-split [OPTIONS] [FILES...]\n");
38 printf ("Split UTF-8 input into words\n");
39 printf ("\nOptions are:\n");
40 /* printf (" -d, --debug output debugging info\n"); */
41 printf (" -o, --output=FILE direct output to FILE instead of stdout\n");
42 printf (" -t, --tag preserve file name tags\n");
43 printf (" -T, --from-file=FILE read input file names from FILE\n");
44 printf (" -0, --null -T reads null-terminated names\n");
45 printf ("\n");
46 printf (" -h, --help print this help list\n");
47 printf (" -v, --version print program version and exit\n");
48 printf ("\n");
49 printf ("Report bugs to <%s>\n", PACKAGE_BUGREPORT);
50 }
51
/* State shared by open_input, word_split and main.  */

/* Streams currently being read from and written to.  Both start out
   as NULL; main falls back to stdin and stdout when nothing else has
   been opened.  */
FILE *input, *output;

/* Cursor into the list of input file names that have not been opened
   yet.  NULL until main points it at the remaining command line
   arguments; open_input advances it by one on every successful call.  */
char **input_file;

/* Nonzero when -t was given: word_split then copies lines that begin
   with '>' to the output unchanged instead of splitting them.  */
int tag_option;
56
57 int
58 open_input ()
59 {
60 if (input_file && *input_file)
61 {
62 char *name = *input_file++;
63 if (name[0] == '-' && name[1] == 0)
64 input = stdin;
65 else
66 {
67 input = fopen (name, "r");
68 if (!input)
69 error (1, errno, "cannot open input file %s", name);
70 }
71 return 0;
72 }
73 return 1;
74 }
75
76 static int after_delim_output = 1;
77
78 int
79 word_split ()
80 {
81 unsigned wc;
82 int after_newline;
83
84 while ((wc = fgetc (input)) != EOF)
85 {
86 if (wc < 0x80)
87 {
88 if (tag_option)
89 {
90 if (after_newline && wc == '>')
91 {
92 do
93 fputc (wc, output);
94 while ((wc = fgetc (input)) != EOF && wc != '\n');
95 fputc ('\n', output);
96 after_delim_output = 1;
97 after_newline = 0;
98 continue;
99 }
100 after_newline = wc == '\n';
101 }
102
103 if (isalnum (wc))
104 {
105 fputc (wc, output);
106 }
107 else
108 {
109 if (!after_delim_output)
110 {
111 fputc ('\n', output);
112 after_delim_output = 1;
113 }
114 continue;
115 }
116 }
117 else
118 {
119 int count;
120
121 if (0xc2 <= wc && wc <= 0xdf)
122 count = 2;
123 else if (0xe0 <= wc && wc <= 0xef)
124 count = 3;
125 else if (0xf0 <= wc && wc <= 0xf4)
126 count = 4;
127 else
128 {
129 /* FIXME: don't know what to do */
130 error (0, 0, "unknown UTF-8 char: %x", wc);
131 fputc (wc, output);
132 continue;
133 }
134
135 fputc (wc, output);
136 while (--count)
137 {
138 if ((wc = fgetc (input)) == EOF)
139 {
140 error (1, 0, "unexpected end of file");
141 break;
142 }
143 fputc (wc, output);
144 }
145 }
146 after_delim_output = 0;
147 }
148 return !open_input ();
149 }
150
151 int
152 main (int argc, char **argv)
153 {
154 int c;
155
156 program_name = argv[0];
157
158 while ((c = getopt_long (argc, argv, "0dho:T:tv", options, NULL)) != EOF)
159 {
160 switch (c)
161 {
162 case '0':
163 filename_terminator = 0;
164 break;
165
166 case 'd':
167 error (0, 0, "warning: the --debug option is not yet supported");
168 /* FIXME */
169 break;
170
171 case PROGNAME_OPTION:
172 program_name = optarg;
173 break;
174
175 case 'h':
176 usage ();
177 exit (0);
178
179 case 'o':
180 output = fopen (optarg, "w");
181 if (!output)
182 error (1, errno, "cannot open output file %s", optarg);
183 break;
184
185 case 'T':
186 read_names_from_file (optarg);
187 break;
188
189 case 't':
190 tag_option = 1;
191 break;
192
193 case 'v':
194 swis_version (stdout, "word-split");
195 exit (0);
196
197 default:
198 exit (1);
199 }
200 }
201
202 argc -= optind;
203 argv += optind;
204
205 update_argcv (&argc, &argv);
206
207 if (argc)
208 {
209 input_file = argv;
210 open_input ();
211 }
212
213 if (!input)
214 input = stdin;
215 if (!output)
216 output = stdout;
217
218 while (word_split ())
219 ;
220
221 exit (0);
222 }
223
224
225
226

Send suggestions and bug reports to Sergey Poznyakoff
ViewVC Help
Powered by ViewVC 1.1.20