/* * medin.c * * Copyright (c) Chris Putnam 2004-2018 * * Source code released under the GPL version 2 * */ #include #include #include "is_ws.h" #include "str.h" #include "str_conv.h" #include "fields.h" #include "xml.h" #include "xml_encoding.h" #include "iso639_2.h" #include "bibutils.h" #include "bibformats.h" static int medin_readf( FILE *fp, char *buf, int bufsize, int *bufpos, str *line, str *reference, int *fcharset ); static int medin_processf( fields *medin, char *data, char *filename, long nref, param *p ); /***************************************************** PUBLIC: void medin_initparams() *****************************************************/ void medin_initparams( param *p, const char *progname ) { p->readformat = BIBL_MEDLINEIN; p->charsetin = BIBL_CHARSET_UNICODE; p->charsetin_src = BIBL_SRC_DEFAULT; p->latexin = 0; p->xmlin = 1; p->utf8in = 1; p->nosplittitle = 0; p->verbose = 0; p->addcount = 0; p->output_raw = BIBL_RAW_WITHMAKEREFID | BIBL_RAW_WITHCHARCONVERT; p->readf = medin_readf; p->processf = medin_processf; p->cleanf = NULL; p->typef = NULL; p->convertf = NULL; p->all = NULL; p->nall = 0; slist_init( &(p->asis) ); slist_init( &(p->corps) ); if ( !progname ) p->progname = NULL; else p->progname = strdup( progname ); } /***************************************************** PUBLIC: int medin_readf() *****************************************************/ /* * The only difference between MEDLINE and PUBMED in format is * that the entire library is wrapped in * or tags... */ static char *wrapper[] = { "PubmedArticle", "MedlineCitation" }; static int nwrapper = sizeof( wrapper ) / sizeof( wrapper[0] ); static char * medin_findstartwrapper( char *buf, int *ntype ) { char *startptr=NULL; int i; for ( i=0; idata ) { m = xml_getencoding( line ); if ( m!=CHARSET_UNKNOWN ) file_charset = m; } if ( line->data ) { startptr = medin_findstartwrapper( line->data, &type ); } if ( startptr || inref ) { if ( inref ) str_strcat( &tmp, line ); else { str_strcatc( &tmp, startptr ); inref = 1; } endptr = medin_findendwrapper( str_cstr( &tmp ), type ); if ( endptr ) { str_segcpy( reference, str_cstr( &tmp ), endptr ); haveref = 1; } } } str_free( &tmp ); *fcharset = file_charset; return haveref; } /***************************************************** PUBLIC: int medin_processf() *****************************************************/ typedef struct xml_convert { char *in; /* The input tag */ char *a, *aval; /* The attribute="attribute_value" pair, if nec. */ char *out; /* The output tag */ int level; } xml_convert; static int medin_doconvert( xml *node, fields *info, xml_convert *c, int nc, int *found ) { int i, fstatus; char *d; *found = 0; if ( !xml_has_value( node ) ) return BIBL_OK; d = xml_value_cstr( node ); for ( i=0; iMechanism and..... */ static int medin_articletitle( xml *node, fields *info ) { int fstatus, status = BIBL_OK; if ( xml_has_value( node ) ) { fstatus = fields_add( info, "TITLE", xml_value_cstr( node ), 0 ); if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR; } return status; } /* 2003 Jan-Feb */ static int medin_medlinedate( fields *info, char *p, int level ) { int fstatus; str tmp; str_init( &tmp ); p = str_cpytodelim( &tmp, skip_ws( p ), " \t\n\r", 0 ); if ( str_memerr( &tmp ) ) return BIBL_ERR_MEMERR; if ( str_has_value( &tmp ) ) { fstatus = fields_add( info, "PARTDATE:YEAR", str_cstr( &tmp ), level ); if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR; } p = str_cpytodelim( &tmp, skip_ws( p ), " \t\n\r", 0 ); if ( str_memerr( &tmp ) ) return BIBL_ERR_MEMERR; if ( str_has_value( &tmp ) ) { str_findreplace( &tmp, "-", "/" ); fstatus = fields_add( info, "PARTDATE:MONTH", str_cstr( &tmp ), level ); if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR; } (void) str_cpytodelim( &tmp, skip_ws( p ), " \t\n\r", 0 ); if ( str_memerr( &tmp ) ) return BIBL_ERR_MEMERR; if ( str_has_value( &tmp ) ) { fstatus = fields_add( info, "PARTDATE:DAY", str_cstr( &tmp ), level ); if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR; } str_free( &tmp ); return BIBL_OK; } /* eng */ static int medin_language( xml *node, fields *info, int level ) { char *code, *language; int fstatus; code = xml_value_cstr( node ); if ( !code ) return BIBL_OK; language = iso639_2_from_code( code ); if ( language ) fstatus = fields_add( info, "LANGUAGE", language, level ); else fstatus = fields_add( info, "LANGUAGE", code, level ); if ( fstatus==FIELDS_OK ) return BIBL_OK; else return BIBL_ERR_MEMERR; } /* * 0027-8424 * * 100 * 21 * * 2003 * Oct * 14 * * * * * or.... * * * 0735-0414 * * 38 * 1 * * 2003 Jan-Feb * * * Alcohol and alcoholism (Oxford, Oxfordshire) * Alcohol Alcohol. * */ static int medin_journal1( xml *node, fields *info ) { xml_convert c[] = { { "Title", NULL, NULL, "TITLE", 1 }, { "ISOAbbreviation", NULL, NULL, "SHORTTITLE", 1 }, { "ISSN", NULL, NULL, "ISSN", 1 }, { "Volume", NULL, NULL, "VOLUME", 1 }, { "Issue", NULL, NULL, "ISSUE", 1 }, { "Year", NULL, NULL, "PARTDATE:YEAR", 1 }, { "Month", NULL, NULL, "PARTDATE:MONTH", 1 }, { "Day", NULL, NULL, "PARTDATE:DAY", 1 }, }; int nc = sizeof( c ) / sizeof( c[0] ), status, found; if ( xml_has_value( node ) ) { status = medin_doconvert( node, info, c, nc, &found ); if ( status!=BIBL_OK ) return status; if ( !found ) { if ( xml_tag_matches( node, "MedlineDate" ) ) { status = medin_medlinedate( info, xml_value_cstr( node ), 1 ); if ( status!=BIBL_OK ) return status; } if ( xml_tag_matches( node, "Language" ) ) { status = medin_language( node, info, 1 ); if ( status!=BIBL_OK ) return status; } } } if ( node->down ) { status = medin_journal1( node->down, info ); if ( status!=BIBL_OK ) return status; } if ( node->next ) { status = medin_journal1( node->next, info ); if ( status!=BIBL_OK ) return status; } return BIBL_OK; } /* * 12111-6 * */ static int medin_pagination( xml *node, fields *info ) { int i, fstatus, status; str sp, ep; char *p, *pp; if ( xml_tag_matches( node, "MedlinePgn" ) && node->value.len ) { strs_init( &sp, &ep, NULL ); p = str_cpytodelim( &sp, xml_value_cstr( node ), "-", 1 ); if ( str_memerr( &sp ) ) return BIBL_ERR_MEMERR; if ( str_has_value( &sp ) ) { fstatus = fields_add( info, "PAGES:START", str_cstr( &sp ), 1 ); if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR; } (void) str_cpytodelim( &ep, p, "", 0 ); if ( str_memerr( &ep ) ) return BIBL_ERR_MEMERR; if ( str_has_value( &ep ) ) { if ( sp.len > ep.len ) { for ( i=sp.len-ep.len; idown ) { status = medin_pagination( node->down, info ); if ( status!=BIBL_OK ) return status; } if ( node->next ) { status = medin_pagination( node->next, info ); if ( status!=BIBL_OK ) return status; } return BIBL_OK; } /* * ljwejrelr * */ static int medin_abstract( xml *node, fields *info ) { int fstatus; if ( xml_tag_matches_has_value( node, "AbstractText" ) ) { fstatus = fields_add( info, "ABSTRACT", xml_value_cstr( node ), 0 ); if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR; } else if ( node->next ) return medin_abstract( node->next, info ); return BIBL_OK; } /* * * Barondeau * David P * ( or David P ) * DP * * * Organization * * */ static int medin_author( xml *node, str *name ) { char *p; if ( xml_tag_matches( node, "LastName" ) ) { if ( str_has_value( name ) ) { str_prepend( name, "|" ); str_prepend( name, xml_value_cstr( node ) ); } else str_strcat( name, xml_value( node ) ); } else if ( xml_tag_matches( node, "ForeName" ) || xml_tag_matches( node, "FirstName" ) ) { p = xml_value_cstr( node ); while ( p && *p ) { if ( str_has_value( name ) ) str_addchar( name, '|' ); while ( *p==' ' ) p++; while ( *p && *p!=' ' ) str_addchar( name, *p++ ); } } else if ( xml_tag_matches( node, "Initials" ) && !strchr( name->data, '|' )) { p = xml_value_cstr( node ); while ( p && *p ) { if ( str_has_value( name ) ) str_addchar( name, '|' ); if ( !is_ws(*p) ) str_addchar( name, *p++ ); } } if ( node->next ) medin_author( node->next, name ); return BIBL_OK; } static int medin_corpauthor( xml *node, str *name ) { if ( xml_tag_matches( node, "CollectiveName" ) ) { str_strcpy( name, xml_value( node ) ); } else if ( node->next ) medin_corpauthor( node->next, name ); return BIBL_OK; } static int medin_authorlist( xml *node, fields *info ) { int fstatus, status; str name; char *tag; str_init( &name ); node = node->down; while ( node ) { if ( xml_tag_matches( node, "Author" ) && node->down ) { status = medin_author( node->down, &name ); tag = "AUTHOR"; if ( str_is_empty( &name ) ) { status = medin_corpauthor( node->down, &name ); tag = "AUTHOR:CORP"; } if ( str_memerr( &name ) || status!=BIBL_OK ) return BIBL_ERR_MEMERR; if ( str_has_value( &name ) ) { fstatus = fields_add(info,tag,name.data,0); if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR; } str_empty( &name ); } node = node->next; } str_free( &name ); return BIBL_OK; } /* * Journal Article * */ /* * United States * Proc Natl Acad Sci U S A * 7507876 * */ static int medin_journal2( xml *node, fields *info ) { int fstatus, status = BIBL_OK; if ( xml_tag_matches_has_value( node, "MedlineTA" ) && fields_find( info, "TITLE", LEVEL_HOST )==FIELDS_NOTFOUND ) { fstatus = fields_add( info, "TITLE", xml_value_cstr( node ), 1 ); if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR; } if ( node->down ) { status = medin_journal2( node->down, info ); if ( status!=BIBL_OK ) return status; } if ( node->next ) status = medin_journal2( node->next, info ); return status; } /* Biophysics Crystallography, X-Ray */ static int medin_meshheading( xml *node, fields *info ) { int fstatus, status = BIBL_OK; if ( xml_tag_matches_has_value( node, "DescriptorName" ) ) { fstatus = fields_add( info, "KEYWORD", xml_value_cstr( node ), 0 ); if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR; } if ( node->next ) status = medin_meshheading( node->next, info ); return status; } static int medin_meshheadinglist( xml *node, fields *info ) { int status = BIBL_OK; if ( xml_tag_matches( node, "MeshHeading" ) && node->down ) { status = medin_meshheading( node->down, info ); if ( status!=BIBL_OK ) return status; } if ( node->next ) status = medin_meshheadinglist( node->next, info ); return status; } /* * .... * * 14523232 * 10.1073/pnas.2133463100 * 2133463100 * PMC4833866 * * * * I think "pii" is "Publisher Item Identifier" */ static int medin_pubmeddata( xml *node, fields *info ) { xml_convert c[] = { { "ArticleId", "IdType", "doi", "DOI", 0 }, { "ArticleId", "IdType", "pubmed", "PMID", 0 }, { "ArticleId", "IdType", "medline", "MEDLINE", 0 }, { "ArticleId", "IdType", "pmc", "PMC", 0 }, { "ArticleId", "IdType", "pii", "PII", 0 }, }; int nc = sizeof( c ) / sizeof( c[0] ), found, status; status = medin_doconvert( node, info, c, nc, &found ); if ( status!=BIBL_OK ) return status; if ( node->next ) { status = medin_pubmeddata( node->next, info ); if ( status!=BIBL_OK ) return status; } if ( node->down ) { medin_pubmeddata( node->down, info ); if ( status!=BIBL_OK ) return status; } return BIBL_OK; } static int medin_article( xml *node, fields *info ) { int fstatus, status = BIBL_OK; if ( xml_tag_matches( node, "Journal" ) ) status = medin_journal1( node, info ); else if ( xml_tag_matches( node, "ArticleTitle" ) ) status = medin_articletitle( node, info ); else if ( xml_tag_matches( node, "Pagination" ) && node->down ) status = medin_pagination( node->down, info ); else if ( xml_tag_matches( node, "Abstract" ) && node->down ) status = medin_abstract( node->down, info ); else if ( xml_tag_matches( node, "AuthorList" ) ) status = medin_authorlist( node, info ); else if ( xml_tag_matches( node, "Language" ) ) status = medin_language( node, info, 0 ); else if ( xml_tag_matches( node, "Affiliation" ) ) { fstatus = fields_add( info, "ADDRESS", xml_value_cstr( node ), 0 ); if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR; } if ( status!=BIBL_OK ) return status; if ( node->next ) status = medin_article( node->next, info ); return status; } static int medin_medlinecitation( xml *node, fields *info ) { int fstatus, status = BIBL_OK; if ( xml_tag_matches_has_value( node, "PMID" ) ) { fstatus = fields_add( info, "PMID", xml_value_cstr( node ), 0 ); if ( fstatus!=FIELDS_OK ) return BIBL_ERR_MEMERR; } if ( node->down ) { if ( xml_tag_matches( node, "Article" ) ) { status = medin_article( node->down, info ); } else if ( xml_tag_matches( node, "MedlineJournalInfo" ) ) { status = medin_journal2( node->down, info ); } else if ( xml_tag_matches( node, "MeshHeadingList" ) ) status = medin_meshheadinglist( node->down, info ); if ( status!=BIBL_OK ) return status; } if ( node->next ) status = medin_medlinecitation( node->next, info ); return status; } static int medin_pubmedarticle( xml *node, fields *info ) { int status = BIBL_OK; if ( node->down ) { if ( xml_tag_matches( node, "MedlineCitation" ) ) status = medin_medlinecitation( node->down, info ); else if ( xml_tag_matches( node, "PubmedData" ) ) status = medin_pubmeddata( node->down, info ); if ( status!=BIBL_OK ) return status; } if ( node->next ) status = medin_pubmedarticle( node->next, info ); return status; } static int medin_assembleref( xml *node, fields *info ) { int status = BIBL_OK; if ( node->down ) { if ( xml_tag_matches( node, "PubmedArticle" ) ) status = medin_pubmedarticle( node->down, info ); else if ( xml_tag_matches( node, "MedlineCitation" ) ) status = medin_medlinecitation( node->down, info ); else status = medin_assembleref( node->down, info ); } if ( status!=BIBL_OK ) return status; if ( node->next ) { status = medin_assembleref( node->next, info ); if ( status!=BIBL_OK ) return status; } /* assume everything is a journal article */ if ( fields_num( info ) ) { status = fields_add( info, "RESOURCE", "text", 0 ); if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR; status = fields_add( info, "ISSUANCE", "continuing", 1 ); if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR; status = fields_add( info, "GENRE:MARC", "periodical", 1 ); if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR; status = fields_add( info, "GENRE:BIBUTILS", "academic journal", 1 ); if ( status!=FIELDS_OK ) return BIBL_ERR_MEMERR; status = BIBL_OK; } return status; } static int medin_processf( fields *medin, char *data, char *filename, long nref, param *p ) { int status; xml top; xml_init( &top ); xml_parse( data, &top ); status = medin_assembleref( &top, medin ); xml_free( &top ); if ( status==BIBL_OK ) return 1; return 0; }