/* * risin.c * * Copyright (c) Chris Putnam 2003-2018 * * Source code released under the GPL version 2 * */ #include #include #include #include #include "str.h" #include "str_conv.h" #include "fields.h" #include "name.h" #include "title.h" #include "url.h" #include "utf8.h" #include "serialno.h" #include "reftypes.h" #include "bibformats.h" #include "generic.h" extern variants ris_all[]; extern int ris_nall; /***************************************************** PUBLIC: void risin_initparams() *****************************************************/ static int risin_readf( FILE *fp, char *buf, int bufsize, int *bufpos, str *line, str *reference, int *fcharset ); static int risin_processf( fields *risin, char *p, char *filename, long nref, param *pm ); static int risin_typef( fields *risin, char *filename, int nref, param *p ); static int risin_convertf( fields *risin, fields *info, int reftype, param *p ); void risin_initparams( param *p, const char *progname ) { p->readformat = BIBL_RISIN; p->charsetin = BIBL_CHARSET_DEFAULT; p->charsetin_src = BIBL_SRC_DEFAULT; p->latexin = 0; p->xmlin = 0; p->utf8in = 0; p->nosplittitle = 0; p->verbose = 0; p->addcount = 0; p->output_raw = 0; p->readf = risin_readf; p->processf = risin_processf; p->cleanf = NULL; p->typef = risin_typef; p->convertf = risin_convertf; p->all = ris_all; p->nall = ris_nall; slist_init( &(p->asis) ); slist_init( &(p->corps) ); if ( !progname ) p->progname = NULL; else p->progname = strdup( progname ); } /***************************************************** PUBLIC: int risin_readf() *****************************************************/ /* RIS definition of a tag is strict: character 1 = uppercase alphabetic character character 2 = uppercase alphabetic character or digit character 3 = space (ansi 32) character 4 = space (ansi 32) character 5 = dash (ansi 45) character 6 = space (ansi 32) some sources don't have a space at character 6 if there is no data (such as "ER -" records). Handle this. www.omicsonline.org mangles the RIS specification and puts _three_ spaces before the dash. Handle this too. */ static int is_ris_tag( char *buf ) { if (! (buf[0]>='A' && buf[0]<='Z') ) return 0; if (! (((buf[1]>='A' && buf[1]<='Z'))||(buf[1]>='0'&&buf[1]<='9')) ) return 0; if (buf[2]!=' ') return 0; if (buf[3]!=' ') return 0; /*...RIS fits specifications with two spaces */ if (buf[4]=='-') { if ( buf[5]==' ' || buf[5]=='\0' || buf[5]=='\n' || buf[5]=='\r' ) return 1; } /* ...extra space in tag? */ else if (buf[4]==' ') { if ( buf[5]=='-' && ( buf[6]==' ' || buf[6]=='\0' || buf[6]=='\n' || buf[6]=='\r' ) ) return 1; } return 0; } static int is_ris_start_tag( char *p ) { /* ...TY tag that fits specifications */ if ( !strncmp( p, "TY - ", 6 ) ) return 1; /* ...TY tag with an extra space? */ if ( !strncmp( p, "TY - ", 7 ) ) return 1; return 0; } static int is_ris_end_tag( char *p ) { /* ...ER tag that fits specifications */ if ( !strncmp( p, "ER -", 5 ) ) return 1; /* ...ER tag with an extra space? */ if ( !strncmp( p, "ER -", 6 ) ) return 1; return 0; } static int readmore( FILE *fp, char *buf, int bufsize, int *bufpos, str *line ) { if ( line->len ) return 1; else return str_fget( fp, buf, bufsize, bufpos, line ); } static int risin_readf( FILE *fp, char *buf, int bufsize, int *bufpos, str *line, str *reference, int *fcharset ) { int haveref = 0, inref = 0, readtoofar = 0; char *p; *fcharset = CHARSET_UNKNOWN; while ( !haveref && readmore( fp, buf, bufsize, bufpos, line ) ) { if ( str_is_empty( line ) ) continue; p = &( line->data[0] ); if ( utf8_is_bom( p ) ) { *fcharset = CHARSET_UNICODE; p += 3; } /* References are bounded by tags 'TY - ' && 'ER - ' */ if ( is_ris_start_tag( p ) ) { if ( !inref ) inref = 1; else { /* we've read too far.... */ readtoofar = 1; inref = 0; } } if ( is_ris_tag( p ) ) { if ( !inref ) { fprintf(stderr,"Warning. Tagged line not " "in properly started reference.\n"); fprintf(stderr,"Ignored: '%s'\n", p ); } else if ( is_ris_end_tag( p ) ) { inref = 0; } else { str_addchar( reference, '\n' ); str_strcatc( reference, p ); } } /* not a tag, but we'll append to last values ...*/ else if ( inref && !is_ris_end_tag( p ) ) { str_addchar( reference, '\n' ); str_strcatc( reference, p ); } if ( !inref && reference->len ) haveref = 1; if ( !readtoofar ) str_empty( line ); } if ( inref ) haveref = 1; return haveref; } /***************************************************** PUBLIC: int risin_processf() *****************************************************/ static char* process_untagged_line( str *value, char *p ) { while ( *p==' ' || *p=='\t' ) p++; while ( *p && *p!='\r' && *p!='\n' ) str_addchar( value, *p++ ); while ( *p=='\r' || *p=='\n' ) p++; return p; } static char* process_tagged_line( str *tag, str *value, char *p ) { int i = 0; while ( i<6 && *p && *p!='\n' && *p!='\r' ) { if ( i<2 ) str_addchar( tag, *p ); p++; i++; } while ( *p==' ' || *p=='\t' ) p++; while ( *p && *p!='\r' && *p!='\n' ) str_addchar( value, *p++ ); str_trimendingws( value ); while ( *p=='\n' || *p=='\r' ) p++; return p; } static int merge_tag_value( fields *risin, str *tag, str *value, int *tag_added ) { str *oldval; int n, status; if ( str_has_value( value ) ) { if ( *tag_added==1 ) { n = fields_num( risin ); if ( n>0 ) { oldval = fields_value( risin, n-1, FIELDS_STRP ); str_addchar( oldval, ' ' ); str_strcat( oldval, value ); if ( str_memerr( oldval ) ) return BIBL_ERR_MEMERR; } } else { status = fields_add( risin, str_cstr( tag ), str_cstr( value ), 0 ); if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR; *tag_added = 1; } } return BIBL_OK; } static int add_tag_value( fields *risin, str *tag, str *value, int *tag_added ) { int status; if ( str_has_value( value ) ) { status = fields_add( risin, str_cstr( tag ), str_cstr( value ), 0 ); if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR; *tag_added = 1; } else { *tag_added = 0; } return BIBL_OK; } static int risin_processf( fields *risin, char *p, char *filename, long nref, param *pm ) { int status, tag_added = 0, ret = 1; str tag, value; strs_init( &tag, &value, NULL ); while ( *p ) { /* ...tag, add entry */ if ( is_ris_tag( p ) ) { str_empty( &tag ); str_empty( &value ); p = process_tagged_line( &tag, &value, p ); status = add_tag_value( risin, &tag, &value, &tag_added ); if ( status!=BIBL_OK ) { ret = 0; goto out; } } /* ...no tag, merge with previous line */ else { str_empty( &value ); p = process_untagged_line( &value, p ); status = merge_tag_value( risin, &tag, &value, &tag_added ); if ( status!=BIBL_OK ) { ret = 0; goto out; } } } out: strs_free( &tag, &value, NULL ); return ret; } /***************************************************** PUBLIC: int risin_typef() *****************************************************/ static int risin_typef( fields *risin, char *filename, int nref, param *p ) { int ntypename, nrefname, is_default; char *refname = "", *typename = ""; ntypename = fields_find( risin, "TY", LEVEL_MAIN ); nrefname = fields_find( risin, "ID", LEVEL_MAIN ); if ( ntypename!=FIELDS_NOTFOUND ) typename = fields_value( risin, ntypename, FIELDS_CHRP_NOUSE ); if ( nrefname!=FIELDS_NOTFOUND ) refname = fields_value( risin, nrefname, FIELDS_CHRP_NOUSE ); return get_reftype( typename, nref, p->progname, p->all, p->nall, refname, &is_default, REFTYPE_CHATTY ); } /***************************************************** PUBLIC: int risin_convertf() *****************************************************/ static int is_uri_file_scheme( char *p ) { if ( !strncmp( p, "file:", 5 ) ) return 5; return 0; } static int risin_linkedfile( fields *bibin, int n, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout ) { int fstatus, m; char *p; /* if URL is file:///path/to/xyz.pdf, only store "///path/to/xyz.pdf" */ m = is_uri_file_scheme( str_cstr( invalue ) ); if ( m ) { /* skip past "file:" and store only actual path */ p = invalue->data + m; fstatus = fields_add( bibout, outtag, p, level ); if ( fstatus==FIELDS_OK ) return BIBL_OK; else return BIBL_ERR_MEMERR; } /* if URL is http:, ftp:, etc. store as a URL */ m = is_uri_remote_scheme( str_cstr( invalue ) ); if ( m!=-1 ) { fstatus = fields_add( bibout, "URL", str_cstr( invalue ), level ); if ( fstatus==FIELDS_OK ) return BIBL_OK; else return BIBL_ERR_MEMERR; } /* badly formed, RIS wants URI, but store value anyway */ fstatus = fields_add( bibout, outtag, str_cstr( invalue ), level ); if ( fstatus==FIELDS_OK ) return BIBL_OK; else return BIBL_ERR_MEMERR; } /* scopus puts DOI in the DO or DI tag, but it needs cleaning */ static int risin_doi( fields *bibin, int n, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout ) { int fstatus, doi; doi = is_doi( str_cstr( invalue ) ); if ( doi!=-1 ) { fstatus = fields_add( bibout, "DOI", &(invalue->data[doi]), level ); if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR; } return BIBL_OK; } static int risin_date( fields *bibin, int n, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout ) { char *p = invalue->data; int part, status; str date; part = ( !strncasecmp( outtag, "PART", 4 ) ); str_init( &date ); while ( *p && *p!='/' ) str_addchar( &date, *p++ ); if ( str_memerr( &date ) ) return BIBL_ERR_MEMERR; if ( *p=='/' ) p++; if ( str_has_value( &date ) ) { if ( part ) status = fields_add( bibout, "PARTDATE:YEAR", str_cstr( &date ), level ); else status = fields_add( bibout, "DATE:YEAR", str_cstr( &date ), level ); if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR; } str_empty( &date ); while ( *p && *p!='/' ) str_addchar( &date, *p++ ); if ( str_memerr( &date ) ) return BIBL_ERR_MEMERR; if ( *p=='/' ) p++; if ( str_has_value( &date ) ) { if ( part ) status = fields_add( bibout, "PARTDATE:MONTH", str_cstr( &date ), level ); else status = fields_add( bibout, "DATE:MONTH", str_cstr( &date ), level ); if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR; } str_empty( &date ); while ( *p && *p!='/' ) str_addchar( &date, *p++ ); if ( str_memerr( &date ) ) return BIBL_ERR_MEMERR; if ( *p=='/' ) p++; if ( str_has_value( &date ) ) { if ( part ) status = fields_add( bibout, "PARTDATE:DAY", str_cstr( &date ), level ); else status = fields_add( bibout, "DATE:DAY", str_cstr( &date ), level ); if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR; } str_empty( &date ); while ( *p ) str_addchar( &date, *p++ ); if ( str_memerr( &date ) ) return BIBL_ERR_MEMERR; if ( str_has_value( &date ) ) { if ( part ) status = fields_add( bibout, "PARTDATE:OTHER", str_cstr( &date ), level ); else status = fields_add( bibout, "DATE:OTHER", str_cstr( &date ), level ); if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR; } str_free( &date ); return BIBL_OK; } static int risin_person( fields *bibin, int n, str *intag, str *invalue, int level, param *pm, char *outtag, fields *bibout ) { int i, begin, end, ok, status = BIBL_OK; slist tokens; str name; str_init( &name ); slist_init( &tokens ); status = slist_tokenize( &tokens, invalue, " \t\r\n", 1 ); if ( status!=SLIST_OK ) { status = BIBL_ERR_MEMERR; goto out; } begin = 0; while ( begin < tokens.n ) { end = begin + 1; while ( end < tokens.n && strcasecmp( slist_cstr( &tokens, end ), "and" ) ) end++; str_empty( &name ); for ( i=begin; ibegin ) str_addchar( &name, ' ' ); str_strcat( &name, slist_str( &tokens, i ) ); } ok = name_add( bibout, outtag, str_cstr( &name ), level, &(pm->asis), &(pm->corps) ); if ( !ok ) { status = BIBL_ERR_MEMERR; goto out; } begin = end + 1; /* Handle repeated 'and' errors */ while ( begin < tokens.n && !strcasecmp( slist_cstr( &tokens, begin ), "and" ) ) begin++; } out: str_free( &name ); slist_free( &tokens ); return status; } /* look for thesis-type hint */ static int risin_thesis_hints( fields *bibin, int reftype, param *p, fields *bibout ) { int i, nfields, fstatus; char *tag, *value; if ( strcasecmp( p->all[reftype].type, "THES" ) ) return BIBL_OK; nfields = fields_num( bibin ); for ( i=0; iverbose && strcmp( tag, "TY" ) ) { if ( p->progname ) fprintf( stderr, "%s: ", p->progname ); fprintf( stderr, "Did not identify RIS tag '%s'\n", tag ); } } static int risin_convertf( fields *bibin, fields *bibout, int reftype, param *p ) { static int (*convertfns[NUM_REFTYPES])(fields *, int, str *, str *, int, param *, char *, fields *) = { [ 0 ... NUM_REFTYPES-1 ] = generic_null, [ SIMPLE ] = generic_simple, [ TITLE ] = generic_title, [ SERIALNO ] = generic_serialno, [ NOTES ] = generic_notes, [ URL ] = generic_url, [ GENRE ] = generic_genre, [ PERSON ] = risin_person, [ DATE ] = risin_date, [ DOI ] = risin_doi, [ LINKEDFILE ] = risin_linkedfile, }; int process, level, i, nfields, status = BIBL_OK; str *intag, *invalue; char *outtag; nfields = fields_num( bibin ); for ( i=0; idata, reftype, p->all, p->nall, &process, &level, &outtag ) ) { risin_report_notag( p, intag->data ); continue; } invalue = fields_value( bibin, i, FIELDS_STRP ); status = convertfns[ process ] ( bibin, i, intag, invalue, level, p, outtag, bibout ); if ( status!=BIBL_OK ) return status; } if ( status == BIBL_OK ) status = risin_thesis_hints( bibin, reftype, p, bibout ); if ( status==BIBL_OK && p->verbose ) fields_report( bibout, stderr ); return status; }