/* GF C Bindings
Copyright (C) 2010 Kevin Kofler
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "gf_lexing.h"
#include <ctype.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
/* A lexer: maps a string to a NULL-terminated array of token strings.
   appLexer() releases the returned array and its tokens with freev(). */
typedef char **(*GF_Lexer)(const char *str);
/* An unlexer: maps a NULL-terminated array of token strings to one string.
   appUnlexer() stores the returned string in an array that it later
   releases with freev(). */
typedef char *(*GF_Unlexer)(char **arr);
/* Release a NULL-terminated vector of heap strings: every element is
   free()d first, then the vector itself. */
static inline void freev(char **p)
{
  size_t i;

  for (i = 0; p[i] != NULL; i++)
    free(p[i]);
  free(p);
}
/* Split STR into its whitespace-separated words.
 *
 * Whitespace is whatever isspace() accepts; leading, trailing and repeated
 * whitespace never produces an empty word.
 *
 * Returns a newly allocated, NULL-terminated array of newly allocated
 * strings (just the terminator when STR contains no word), or NULL when an
 * allocation fails.  The caller owns the array and every string in it.
 */
static char **words(const char *str)
{
  /* The <ctype.h> classifiers are only defined for unsigned char values. */
  const unsigned char *p = (const unsigned char *) str;
  const unsigned char *start;
  char **result, **r;
  size_t count = 0u, len;

  /* First pass: count the words so the array can be sized exactly. */
  while (*p) {
    while (isspace(*p)) p++;
    if (!*p) break;
    count++;
    while (*p && !isspace(*p)) p++;
  }

  r = result = malloc((count + 1) * sizeof *result);
  if (!result) return NULL;

  /* Second pass: copy each word straight out of STR.  No scratch copy of
     the input is made, so there is nothing left over to release. */
  p = (const unsigned char *) str;
  while (count--) {
    while (isspace(*p)) p++;
    start = p;
    while (*p && !isspace(*p)) p++;
    len = (size_t) (p - start);
    *r = malloc(len + 1);
    if (!*r) {
      /* Undo the partial result before reporting the failure. */
      while (r != result) free(*--r);
      free(result);
      return NULL;
    }
    memcpy(*r, start, len);
    (*r)[len] = '\0';
    r++;
  }
  *r = NULL;
  return result;
}
/* Concatenate the strings of the NULL-terminated array ARR, separated by
   single spaces, into a newly allocated string.  An empty array gives a
   newly allocated empty string. */
static char *unwords(char **arr)
{
  size_t total = 0u;
  char **it;
  char *out, *dst;

  /* Each element needs its own length plus one byte: a separator for all
     but the last, the terminating NUL for the last. */
  for (it = arr; *it; ++it)
    total += strlen(*it) + 1u;
  if (total == 0u)
    return calloc(1, 1);

  out = malloc(total);
  dst = out;
  it = arr;
  for (;;) {
    strcpy(dst, *it);
    dst += strlen(*it);
    ++it;
    if (*it == NULL)
      break;
    *dst++ = ' ';
  }
  return out;
}
/* Split STR into its non-empty lines.
 *
 * Lines are separated by '\n'; leading, trailing and repeated newlines never
 * produce an empty line.
 *
 * Returns a newly allocated, NULL-terminated array of newly allocated
 * strings (just the terminator when STR contains no line), or NULL when an
 * allocation fails.  The caller owns the array and every string in it.
 */
static char **lines(const char *str)
{
  const char *p = str;
  const char *start;
  char **result, **r;
  size_t count = 0u, len;

  /* First pass: count the lines so the array can be sized exactly. */
  while (*p) {
    while (*p == '\n') p++;
    if (!*p) break;
    count++;
    while (*p && *p != '\n') p++;
  }

  r = result = malloc((count + 1) * sizeof *result);
  if (!result) return NULL;

  /* Second pass: copy each line straight out of STR.  No scratch copy of
     the input is made, so there is nothing left over to release. */
  p = str;
  while (count--) {
    while (*p == '\n') p++;
    start = p;
    while (*p && *p != '\n') p++;
    len = (size_t) (p - start);
    *r = malloc(len + 1);
    if (!*r) {
      /* Undo the partial result before reporting the failure. */
      while (r != result) free(*--r);
      free(result);
      return NULL;
    }
    memcpy(*r, start, len);
    (*r)[len] = '\0';
    r++;
  }
  *r = NULL;
  return result;
}
/* Concatenate the strings of the NULL-terminated array ARR, separated by
   single newlines (none after the last one), into a newly allocated string.
   An empty array gives a newly allocated empty string. */
static char *unlines(char **arr)
{
  size_t needed = 0u;
  size_t i;
  char *joined, *out;

  /* One extra byte per element: a '\n' separator for all but the last,
     the terminating NUL for the last. */
  for (i = 0; arr[i] != NULL; i++)
    needed += strlen(arr[i]) + 1u;
  if (needed == 0u)
    return calloc(1, 1);

  joined = malloc(needed);
  out = joined;
  i = 0;
  for (;;) {
    size_t n = strlen(arr[i]);

    strcpy(out, arr[i]);
    i++;
    if (arr[i] == NULL)
      break;
    out += n;
    *out++ = '\n';
  }
  return joined;
}
/* Run the lexer F over STR and return its non-empty tokens joined by single
 * spaces (see unwords()) as a newly allocated string.
 *
 * Empty tokens produced by F are dropped and released here; the token array
 * itself is released with freev() once the result has been built.
 * Returns NULL when F returns NULL.
 */
static char *appLexer(GF_Lexer f, const char *str)
{
  char **arr = f(str);
  char **src, **dst;
  char *result;

  if (!arr) return NULL;

  /* Compact the array in place.  A dropped token must be freed right away:
     once the array is compacted and re-terminated, freev() can no longer
     reach it. */
  for (src = dst = arr; *src; src++) {
    if (**src)
      *dst++ = *src;
    else
      free(*src);
  }
  *dst = NULL;

  result = unwords(arr);
  freev(arr);
  return result;
}
/* Apply the unlexer F to STR line by line: each line from lines() is split
   with words(), replaced by what F makes of those words, and the resulting
   lines are joined again with unlines().  Returns the joined string. */
static char *appUnlexer(GF_Unlexer f, const char *str)
{
  char **lineArr = lines(str);
  char **cur;
  char *joined;

  for (cur = lineArr; *cur; cur++) {
    char **tokens = words(*cur);

    /* The original line is no longer needed once it has been tokenised;
       its slot is reused for the unlexed text. */
    free(*cur);
    *cur = f(tokens);
    freev(tokens);
  }

  joined = unlines(lineArr);
  freev(lineArr);
  return joined;
}
/* Return 1 if C is one of the punctuation characters . ? ! , : ;
   and 0 otherwise (including for the NUL character). */
static inline int isPunct(char c)
{
  switch (c) {
  case '.':
  case '?':
  case '!':
  case ',':
  case ':':
  case ';':
    return 1;
  default:
    return 0;
  }
}
/* Return 1 if C is one of the punctuation characters . ? !
   and 0 otherwise (including for the NUL character). */
static inline int isMajorPunct(char c)
{
  switch (c) {
  case '.':
  case '?':
  case '!':
    return 1;
  default:
    return 0;
  }
}
/* Return 1 if C is one of the punctuation characters , : ;
   and 0 otherwise (including for the NUL character). */
static inline int isMinorPunct(char c)
{
  switch (c) {
  case ',':
  case ':':
  case ';':
    return 1;
  default:
    return 0;
  }
}
/* Return a newly allocated one-character string holding C. */
static char *charToStr(char c)
{
  char *s = malloc(2);

  s[0] = c;
  s[1] = '\0';
  return s;
}
/* Lex STR into single characters: every non-whitespace character becomes a
 * one-character token (see charToStr()); whitespace is dropped.
 *
 * Returns a newly allocated, NULL-terminated array of newly allocated
 * strings, or NULL if the array cannot be allocated.
 */
static char **lexChars(const char *str)
{
  /* At most one token per input character, plus the terminator. */
  char **result = malloc((strlen(str) + 1) * sizeof *result);
  char **r = result;
  const char *p;

  if (!result) return NULL;

  for (p = str; *p; p++) {
    /* isspace() is only defined for unsigned char values; a plain char
       may be negative for non-ASCII bytes. */
    if (!isspace((unsigned char) *p))
      *r++ = charToStr(*p);
  }
  *r = NULL;
  return result;
}
/* Lex running text into tokens.
 *
 * - Each punctuation character (see isPunct()) becomes a token of its own.
 * - Whitespace separates tokens and is dropped.
 * - Any other run of characters becomes a word token.
 * - The first character of the first word, and of a word that follows major
 *   punctuation (see isMajorPunct()), is converted with tolower().
 *
 * Returns a newly allocated, NULL-terminated array of newly allocated
 * strings, or NULL if the array or a word cannot be allocated.
 */
static char **lexText(const char *str)
{
  /* At most one token per input character, plus the terminator. */
  char **result = malloc((strlen(str) + 1) * sizeof *result);
  char **r = result;
  const char *p = str;
  int uncap = 1;

  if (!result) return NULL;

  while (*p) {
    if (isMajorPunct(*p)) {
      *r++ = charToStr(*p++);
      uncap = 1;
    } else if (isMinorPunct(*p)) {
      *r++ = charToStr(*p++);
      uncap = 0;
    } else if (isspace((unsigned char) *p)) {
      /* Whitespace must leave `uncap' alone: sentences are normally
         separated by ". ", and the word after the blank is the one that
         has to be lower-cased. */
      p++;
    } else {
      const char *start = p;
      char *word;
      size_t l;

      /* <ctype.h> functions are only defined for unsigned char values. */
      while (*p && !isspace((unsigned char) *p) && !isPunct(*p)) p++;
      l = (size_t) (p - start);
      word = malloc(l + 1);
      if (!word) {
        /* Terminate what has been collected so far so it can be freed. */
        *r = NULL;
        freev(result);
        return NULL;
      }
      memcpy(word, start, l);
      word[l] = '\0';
      if (uncap) *word = (char) tolower((unsigned char) *word);
      *r++ = word;
      uncap = 0;
    }
  }
  *r = NULL;
  return result;
}
/* Join the tokens of the NULL-terminated array ARR into running text.
 *
 * - Tokens are separated by single spaces.
 * - A token that is exactly one punctuation character (see isPunct()) is
 *   attached directly to the token before it.
 * - The first token, and the token following attached major punctuation
 *   (see isMajorPunct()), has its first character converted with toupper().
 * - A token of at least two characters that starts and ends with a double
 *   quote is emitted without those two quotes.
 *
 * Returns a newly allocated string (empty for an empty array), or NULL if
 * the buffer cannot be allocated.
 */
static char *unlexText(char **arr)
{
  size_t len = 0u;
  char **p, *result, *r;
  int cap = 1;

  /* Upper bound for the output: every token contributes at most its own
     length plus one separator or terminator byte. */
  for (p = arr; *p; p++)
    len += strlen(*p) + 1u;
  if (!len) return calloc(1, 1);
  r = result = malloc(len);
  if (!result) return NULL;

  p = arr;
  for (;;) {
    const char *word = *p++;
    size_t l = strlen(word);

    /* Drop BOTH surrounding quotes; a lone `"' is not a quoted token. */
    if (l >= 2 && word[0] == '"' && word[l - 1] == '"') {
      word++;
      l -= 2;
    }
    memcpy(r, word, l);
    /* Only touch *r if a character was actually written there, and keep
       the toupper() argument within the unsigned char range. */
    if (cap && l) *r = (char) toupper((unsigned char) *r);
    /* Advance before testing for the end, so the final terminator lands
       after the last word rather than on top of it. */
    r += l;
    if (!*p) break;

    if (isPunct(**p) && !(*p)[1]) {
      /* Lone punctuation token: glue it to the word just written. */
      *r++ = **p;
      cap = isMajorPunct(**p);
      p++;
      if (!*p) break;
    } else
      cap = 0;
    *r++ = ' ';
  }
  *r = '\0';
  return result;
}
/* "chars" string operation: lex STR with lexChars and join the tokens
   through appLexer. */
static char *stringop_chars(const char *str)
{
  char *lexed = appLexer(lexChars, str);

  return lexed;
}
static char *stringop_lextext(const char *str)
{
return appLexer(lexText, str);
}
/* "words" string operation: lex STR with words and join the tokens
   through appLexer. */
static char *stringop_words(const char *str)
{
  char *lexed = appLexer(words, str);

  return lexed;
}
static char *stringop_unlextext(const char *str)
{
return appUnlexer(unlexText, str);
}
/* "unwords" string operation: unlex STR with unwords through
   appUnlexer. */
static char *stringop_unwords(const char *str)
{
  char *unlexed = appUnlexer(unwords, str);

  return unlexed;
}
/* Look up a string operation by name.
   Recognised names are "chars", "lextext", "words", "unlextext" and
   "unwords"; any other name yields NULL. */
GF_StringOp gf_stringOp(const char *op)
{
  static const struct {
    const char *name;
    GF_StringOp fn;
  } table[] = {
    { "chars",     stringop_chars },
    { "lextext",   stringop_lextext },
    { "words",     stringop_words },
    { "unlextext", stringop_unlextext },
    { "unwords",   stringop_unwords }
  };
  size_t i;

  for (i = 0; i < sizeof table / sizeof table[0]; i++) {
    if (strcmp(op, table[i].name) == 0)
      return table[i].fn;
  }
  return NULL;
}