#!/bin/bash

# Public domain notice for all NCBI EDirect scripts is located at:
# https://www.ncbi.nlm.nih.gov/books/NBK179288/#chapter6.Public_Domain_Notice

# pmc2info

# Converts PMC article XML to PMCInfo XML for local archive

# efetch -db pmc -id 6260607,9205847 -format xml | pmc2info
#  OR
# tar -xOzf oa_..._xml.PMC...tar.gz --to-stdout | pmc2info

# Lookup table for the most common section title strings: the left column is
# the title text, the right column is the section type label, and the two
# columns are separated by a single tab.  The quoted delimiter keeps the
# here-document literal (no expansion of any kind).
TYPEMAP=$(cat <<'EOF'
introduction	INTRODUCTION
results	RESULTS
results and discussion	RESULTS_AND_DISCUSSION
discussion	DISCUSSION
conclusions	CONCLUSION
conclusion	CONCLUSION
experimental design	EXPERIMENTAL
experimental section	EXPERIMENTAL
experiment	EXPERIMENTAL
experimental	EXPERIMENTAL
experimental setup	EXPERIMENTAL
experimental procedures	EXPERIMENTAL
materials	MATERIALS
methods	METHODS
materials and methods	MATERIALS_AND_METHODS
material and methods	MATERIALS_AND_METHODS
statistical analysis	DESIGN_OR_ANALYSIS
data analysis	DESIGN_OR_ANALYSIS
study design	DESIGN_OR_ANALYSIS
statistical analyses	DESIGN_OR_ANALYSIS
abbreviations	ABBREVIATIONS
list of abbreviations	ABBREVIATIONS
EOF
)
# Command substitution drops the trailing newline of the table; restore it so
# the stored value still ends with a newline after the last entry.
TYPEMAP+=$'\n'

# Conversion pipeline.  PMC article XML is read from standard input, passed
# through six successive xtract invocations, and finally through
# "transmute -mixed -format".  Each pass consumes the element names produced
# by the pass before it, so the order of the stages matters.
# NOTE(review): the per-pass summaries below are read off the argument names
# in this script; confirm exact xtract semantics against the EDirect docs.
#
# Pass 1: for every "article" pattern, emit the whole record (-element "*")
# with table-wrap, alternatives, inline-formula and disp-formula listed in
# -mask (presumably removing those elements from the output - confirm).
xtract -mixed -pattern article -mask "table-wrap,alternatives,inline-formula,disp-formula" -element "*" |
# Pass 2: turn each article into a Set/Rec record made of these packages:
#   UI  - article-id whose pub-id-type is "pmc"; the "[PMC|]" form is used
#         when the value starts with "PMC", the plain value otherwise
#   ID  - every article-id as <infon key="article-id_<pub-id-type>">
#   JR  - journal-id values as <infon key="journal_id_<journal-id-type>">
#         plus the journal-title as <infon key="journal">
#   CT  - year (via -min), volume, issue, fpage, lpage as keyed infon elements
#   NM  - author contribs as AUTH packages with LastName, Given, Initials
#   KY  - kwd values joined with a space into one <infon key="kwd">
#   LI  - license-p text as <infon key="license">
#   TI  - article-title wrapped in <text>
#   AB  - abstract paragraphs wrapped in <text>
#   BD  - body sections as passage packages holding a <title> or a <text>
# In NM and BD a first -group computes MIN as the minimum of the "^surname"
# / "^title" values, and a second -group only emits items whose value equals
# MIN.  NOTE(review): "^" presumably yields nesting depth, so this would keep
# only the shallowest names/titles - confirm.
xtract -mixed -set Set -rec Rec -pattern article \
  -division article/front -pkg UI \
    -group article-id -if "@pub-id-type" -equals pmc \
      -branch article-id -if article-id -starts-with PMC -element "article-id[PMC|]" \
      -branch article-id -unless article-id -starts-with PMC -element "article-id" \
  -division article/front -pkg ID \
    -group article-meta \
      -branch article-id -pfx "article-id_" -AIDKY "@pub-id-type" \
        -subset article-id -if article-id -starts-with PMC \
          -tag infon -atr key "&AIDKY" -cls -element "article-id[PMC|]" -end infon \
        -subset article-id -unless article-id -starts-with PMC \
          -tag infon -atr key "&AIDKY" -cls -element article-id -end infon \
  -division article/front -pkg JR \
    -group journal-meta \
      -branch journal-id -pfx "journal_id_" -JIDKY "@journal-id-type" \
        -subset journal-id \
          -tag infon -atr key "&JIDKY" -cls -element journal-id -end infon \
      -branch journal-title \
        -subset journal-title -tag infon -att key journal -cls -plain journal-title -end infon \
  -division article/front -pkg CT \
    -branch article-meta \
      -subset article-meta -if year -tag infon -att key year -cls -min year -end infon \
      -subset volume -tag infon -att key volume -cls -element volume -end infon \
      -subset issue -tag infon -att key issue -cls -element issue -end infon \
      -subset fpage -tag infon -att key fpage -cls -element fpage -end infon \
      -subset lpage -tag infon -att key lpage -cls -element lpage -end infon \
  -division article/front -pkg NM -MIN "()" \
    -group contrib-group/contrib -if "@contrib-type" -equals author \
      -branch name -LEVL "^surname" -MIN -min "&LEVL,&MIN" \
    -group contrib-group/contrib -if "@contrib-type" -equals author \
      -branch name -AUTH surname -GIVN given-names -LEVL "^surname" \
        -section name -if "&LEVL" -eq "&MIN" -pkg AUTH \
          -wrp LastName -author "&AUTH" -wrp Given -author "&GIVN" \
          -wrp Initials -initials "&GIVN" \
  -division article/front -pkg KY \
    -branch article-meta -sep " " --KYWDS kwd \
      -subset article-meta -tag infon -att key kwd -cls -plain "&KYWDS" -end infon \
  -division article/front -pkg LI \
    -branch permissions/license \
      -subset license-p -tag infon -att key license -cls -prose license-p -end infon \
  -division article/front \
    -branch title-group \
      -section "*" -pkg TI \
        -subset article-title -wrp text -prose article-title \
  -division article/front \
    -group "abstract/*" \
      -section "*" -pkg AB \
        -subset p -wrp text -prose p \
  -division body -pkg BD -MIN "()" \
    -group "sec/*" \
      -branch sec/title -LEVL "^title" -MIN -min "&LEVL,&MIN" \
    -group "sec/*" \
      -branch sec/title -LEVL "^title" \
        -section title -if "&LEVL" -eq "&MIN" -pkg passage \
          -subset title -wrp title -alpha title \
      -branch sec/p \
        -section p -pkg passage \
          -subset p -wrp text -prose p |
# Pass 3: regroup each Rec.  A top-level UID is written from UI, then:
#   CX - UID plus DOI, PMCID, PMID, PUBID, ELOCID, SRC, JOUR, VOL, ISS and
#        YEAR, each selected from an infon element by its key attribute
#   AX - the AUTH elements, copied whole
#   KL - KEYWORDS and LICENSE taken from the KY and LI infon values
#   TITLE / ABSTRACT - TI and AB text emitted with -encode inside a TEXT tag
#   passage - re-emitted with either a lower-cased title or a text child
# NOTE(review): no pass shown above writes an infon with key "elocation-id"
# or "source", so ELOCID and SRC look like they can never be populated here -
# verify whether that is intentional.
xtract -mixed -set Set -rec Rec -pattern Rec \
  -UID UI -wrp UID -element "&UID" \
  -division Rec -pkg CX \
    -block Rec -wrp UID -element "&UID" \
    -block infon -if "@key" -equals "article-id_doi" -wrp DOI -element infon \
    -block infon -if "@key" -equals "article-id_pmc" -wrp PMCID -element infon \
    -block infon -if "@key" -equals "article-id_pmid" -wrp PMID -element infon \
    -block infon -if "@key" -equals "article-id_publisher-id" -wrp PUBID -element infon \
    -block infon -if "@key" -equals "elocation-id" -wrp ELOCID -element infon \
    -block infon -if "@key" -equals "source" -wrp SRC -element infon \
    -block infon -if "@key" -equals "journal_id_nlm-ta" -wrp JOUR -element infon \
    -block infon -if "@key" -equals "volume" -wrp VOL -element infon \
    -block infon -if "@key" -equals "issue" -wrp ISS -element infon \
    -block infon -if "@key" -equals "year" -wrp YEAR -year infon \
  -division Rec -pkg AX \
    -block AUTH -element "*" \
  -division Rec -pkg KL \
    -block KY -wrp KEYWORDS -element infon \
    -block LI -wrp LICENSE -element infon \
  -division TI -enc TITLE \
      -tag TEXT -att type TITLE -cls -encode text -end TEXT -rst -clr \
  -division AB -enc ABSTRACT \
      -tag TEXT -att type ABSTRACT -cls -encode text -end TEXT -rst -clr \
  -division passage \
    -group passage -if title \
      -section passage -enc passage \
        -wrp title -lower title \
    -group passage -if text \
      -section passage -enc passage \
        -wrp text -element text |
# Pass 4: collect the children of CX, the AUTH entries of AX (rebuilt as
# LastName/Given/Initials) and the children of KL under one CIT package;
# re-wrap the TITLE and ABSTRACT TEXT values; copy passages through, applying
# -lower to titles again.
# NOTE(review): the ALVL and TLVL variables set on the next command are not
# referenced anywhere else in this pass.
xtract -mixed -set Set -rec Rec -pattern Rec \
  -ALVL ALVL -TLVL TLVL -wrp UID -element Rec/UID \
  -division Rec -pkg CIT \
    -group "CX/*" -element "*" \
    -group "AX/*" \
      -block AUTH -pkg AUTH \
        -subset AUTH -wrp LastName -element LastName \
          -wrp Given -element Given -wrp Initials -element Initials \
    -group "KL/*" -element "*" \
  -division TITLE -pkg TITLE -block TITLE -wrp TEXT -element TEXT \
  -division ABSTRACT -group ABSTRACT -pkg ABSTRACT -block ABSTRACT -wrp TEXT -element TEXT \
  -division passage \
    -group passage -if title \
      -section passage -enc passage \
        -wrp title -lower title \
    -group passage -if text \
      -section passage -enc passage \
        -wrp text -element text |
# Pass 5: classify section titles.  TYPEMAP is supplied to -transform through
# process substitution, and each passage title is run through -translate; a
# <default> element labelled PARAGRAPH is written next to it.  CIT, TITLE,
# ABSTRACT and text passages are copied through.
# NOTE(review): presumably -translate yields nothing for titles missing from
# TYPEMAP, leaving <default> as the fallback - confirm.
xtract -mixed -transform <( echo -e "$TYPEMAP" ) -set Set -rec Rec -pattern Rec \
  -wrp UID -element Rec/UID \
  -division Rec -pkg CIT \
    -group "CIT/*" -element "*" \
  -division TITLE -pkg TITLE -block TITLE -wrp TEXT -element TEXT \
  -division ABSTRACT -group ABSTRACT -pkg ABSTRACT -block ABSTRACT -wrp TEXT -element TEXT \
  -division passage \
    -group passage -if title \
      -section passage -enc passage \
        -wrp title -translate title -wrp default -lbl PARAGRAPH \
    -group passage -if text \
      -section passage -enc passage \
        -wrp text -element text |
# Pass 6: write the final PMCInfo records.  SECT starts as "PARAGRAPH"; a
# passage with a default child assigns that to SECT, and one with a title
# child then assigns the title.  Every passage that has text is enclosed in
# an element named by the current SECT value, with the text inside TEXT.
# NOTE(review): this relies on SECT carrying over from a title passage to the
# text passages that follow it - confirm against xtract variable scoping.
xtract -mixed -rec PMCInfo -pattern Rec -SECT "(PARAGRAPH)" \
  -wrp UID -element Rec/UID \
  -division Rec -pkg CIT \
    -group "CIT/*" -element "*" \
  -division TITLE -pkg TITLE -block TITLE -wrp TEXT -element TEXT \
  -division ABSTRACT -group ABSTRACT -pkg ABSTRACT -block ABSTRACT -wrp TEXT -element TEXT \
  -division passage \
    -group passage -if default -SECT default \
    -group passage -if title -SECT title \
    -group passage -if text \
      -section passage -enc "&SECT" \
        -wrp TEXT -element text |
# Final step: hand the PMCInfo XML to transmute with -format (presumably
# re-indenting the XML for output - confirm).
transmute -mixed -format
