root/tools/isis2json.py

Revision 2543, 10.3 kB (checked in by luciano.ramalho, 9 years ago)

added moved to GitHub? warning to isis2json.py

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2# -*- encoding: utf-8 -*-
3#################################################################
4#   
5#     __  __  ______      ________ _____    _______ ____ 
6#    |  \/  |/ __ \ \    / /  ____|  __ \  |__   __/ __ \
7#    | \  / | |  | \ \  / /| |__  | |  | |    | | | |  | |
8#    | |\/| | |  | |\ \/ / |  __| | |  | |    | | | |  | |
9#    | |  | | |__| | \  /  | |____| |__| |    | | | |__| |
10#    |_|  |_|\____/   \/   |______|_____/     |_|  \____/
11#                                                     
12#         _____ _____ _______ _    _ _    _ ____ 
13#        / ____|_   _|__   __| |  | | |  | |  _ \
14#       | |  __  | |    | |  | |__| | |  | | |_) |
15#       | | |_ | | |    | |  |  __  | |  | |  _ <
16#       | |__| |_| |_   | |  | |  | | |__| | |_) |
17#        \_____|_____|  |_|  |_|  |_|\____/|____/
18#
19#  FROM 2011-07-16 THE OFFICIAL REPOSITORY OF isis2json IS:
20#
21#         https://github.com/bireme/isis2json
22#
23#################################################################
24#
25# isis2json.py: convert ISIS and ISO-2709 files to JSON
26#
27# Copyright (C) 2010 BIREME/PAHO/WHO
28#
29# This program is free software: you can redistribute it and/or modify
30# it under the terms of the GNU Lesser General Public License as published
31# by the Free Software Foundation, either version 2.1 of the License, or
32# (at your option) any later version.
33
34# This program is distributed in the hope that it will be useful,
35# but WITHOUT ANY WARRANTY; without even the implied warranty of
36# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
37# GNU Lesser General Public License for more details.
38
39# You should have received a copy of the GNU Lesser General Public License
40# along with this program. If not, see <http://www.gnu.org/licenses/>.
41
42############################
43# this script works with Python or Jython (versions >=2.5 and <3)
44
45import sys
46import argparse
47from uuid import uuid4
48import os
49
50try:
51    import json
52except ImportError:
53    if os.name == 'java': # running Jython
54        from com.xhaus.jyson import JysonCodec as json
55    else:
56        import simplejson as json
57
# When true, iterMstRecords() drops every record whose status is not ACTIVE;
# when false, all records are kept and each one gets a boolean stored under
# ISIS_ACTIVE_KEY.
SKIP_INACTIVE = True
# Default for the -q/--qty command line option (documented there as "ALL").
DEFAULT_QTY = 2**31
# Key under which iterMstRecords() stores the MFN of each record.
ISIS_MFN_KEY = 'mfn'
# Key under which iterMstRecords() stores the active flag (see SKIP_INACTIVE).
ISIS_ACTIVE_KEY = 'active'
# Marker that precedes each subfield identifier inside a field value.
SUBFIELD_DELIMITER = '^'
# Encoding used by iterIsoRecords() to decode field values read from .iso files.
INPUT_ENCODING = 'cp1252'
64
def iterMstRecords(master_file_name, subfields):
    """Yield one dictionary per record read from an ISIS .mst master file.

    Requires Jython with the BIREME zeus library on the classpath; if the
    library cannot be imported, an error message is written to stderr and
    SystemExit(1) is raised.

    Each yielded dictionary maps the field tag (as a string) to a list of
    occurrences, plus ISIS_MFN_KEY -> MFN of the record. When SKIP_INACTIVE
    is false, ISIS_ACTIVE_KEY -> bool is included as well; when it is true,
    non-active records are skipped.

    If `subfields` is true each occurrence is a dictionary: the content of
    the '*' subfield is stored under '_' and every other subfield key maps
    to a list of contents. Otherwise each occurrence is a single string: the
    '*' content first, followed by SUBFIELD_DELIMITER + key + content for
    the remaining subfields.

    The master file is closed when the generator finishes, is closed by the
    caller, or is garbage collected, even if iteration stops early.
    """
    try:
        from br.bireme.zeus.master import MasterFactory, Record
    except ImportError:
        # stderr, not stdout: stdout is the default destination of the JSON
        sys.stderr.write('IMPORT ERROR: Jython 2.5 and zeusIII.jar are '
                         'required to parse .mst files\n')
        raise SystemExit(1)
    mst = MasterFactory.getInstance(master_file_name).getMaster().open()
    try:
        for record in mst:
            fields = {}
            if SKIP_INACTIVE:
                if record.getStatus() != Record.Status.ACTIVE:
                    continue
            else:  # save status only when non-active records are kept
                fields[ISIS_ACTIVE_KEY] = (record.getStatus() ==
                                           Record.Status.ACTIVE)
            fields[ISIS_MFN_KEY] = record.getMfn()
            for field in record.getFields():
                field_key = str(field.getId())
                field_occurrences = fields.setdefault(field_key, [])
                if subfields:
                    content = {}
                    for subfield in field.getSubfields():
                        subfield_key = subfield.getId()
                        if subfield_key == '*':
                            content['_'] = subfield.getContent()
                        else:
                            subfield_occurrences = content.setdefault(
                                subfield_key, [])
                            subfield_occurrences.append(subfield.getContent())
                    field_occurrences.append(content)
                else:
                    content = []
                    for subfield in field.getSubfields():
                        subfield_key = subfield.getId()
                        if subfield_key == '*':
                            # the '*' content always goes first
                            content.insert(0, subfield.getContent())
                        else:
                            content.append(SUBFIELD_DELIMITER + subfield_key +
                                           subfield.getContent())
                    field_occurrences.append(''.join(content))
            yield fields
    finally:
        # runs on exhaustion, on generator.close() and on errors alike
        mst.close()
105
def iterIsoRecords(iso_file_name, subfields):
    """Yield one dictionary per record read from an ISO-2709 file.

    Each yielded dictionary maps the field tag (converted to int and back
    to str, which removes leading zeroes) to a list of occurrences. Field
    values are decoded with INPUT_ENCODING, replacing undecodable bytes.

    If `subfields` is true each occurrence is a dictionary built by
    splitting the value on SUBFIELD_DELIMITER: non-empty text before the
    first delimiter is stored under '_', and each following part is stored
    under its first character. A repeated subfield marker overwrites the
    earlier value. Otherwise each occurrence is the decoded value itself.

    The ISO file is closed when the generator finishes, is closed by the
    caller, or is garbage collected, even if iteration stops early.
    """
    from iso2709 import IsoFile

    def parse(field):
        content = field.value.decode(INPUT_ENCODING, 'replace')
        parts = content.split(SUBFIELD_DELIMITER)
        subs = {}
        main = parts.pop(0)
        if len(main) > 0:
            subs['_'] = main
        for part in parts:
            if not part:
                # two delimiters in a row or a trailing delimiter: there is
                # no subfield marker to use as key, so there is nothing to add
                continue
            subs[part[0]] = part[1:]
        return subs

    iso = IsoFile(iso_file_name)
    try:
        for record in iso:
            fields = {}
            for field in record.directory:
                field_key = str(int(field.tag))  # remove leading zeroes
                field_occurrences = fields.setdefault(field_key, [])
                if subfields:
                    field_occurrences.append(parse(field))
                else:
                    field_occurrences.append(
                        field.value.decode(INPUT_ENCODING, 'replace'))
            yield fields
    finally:
        # runs on exhaustion, on generator.close() and on errors alike
        iso.close()
133
def writeJsonArray(iterRecords, file_name, output, qty, skip, id_tag,
                   gen_uuid, mongo, mfn, subfields, tagprefix, regtype):
    """Write records produced by `iterRecords` to `output` as JSON.

    Arguments:
    iterRecords -- callable(file_name, subfields) returning an iterable of
                   record dictionaries
    file_name   -- passed through to iterRecords
    output      -- object with a write() method
    qty         -- maximum number of records to write
    skip        -- number of leading records to ignore
    id_tag      -- if true, tag number of the field whose single occurrence
                   becomes the "_id" of each record
    gen_uuid    -- if true (and no id_tag), "_id" is a random UUID
    mongo       -- if true, write one JSON dictionary per line with no
                   enclosing brackets or separating commas; otherwise write
                   a JSON array
    mfn         -- if true (and neither id_tag nor gen_uuid), "_id" is the
                   value stored under ISIS_MFN_KEY
    subfields   -- passed through to iterRecords; also selects how the id is
                   extracted (the '_' item of the occurrence when true)
    tagprefix   -- if non-empty, prepended to every all-digit key
    regtype     -- if non-empty, a 'key:value' string added to each record;
                   only the first ':' separates key from value

    Raises:
    ValueError -- regtype is given but contains no ':'
    KeyError   -- id_tag is not present in a record
    TypeError  -- id_tag occurs more than once in a record, or the same id
                  value was already seen in a previous record
    """
    start = skip
    end = start + qty

    if regtype:
        # validate once, up front, instead of failing midway through output
        regtype_key, colon, regtype_value = regtype.partition(':')
        if not colon:
            raise ValueError('regtype must have the form key:value, got %r'
                             % regtype)
    if id_tag:
        id_tag = str(id_tag)
        ids = set()
    else:
        id_tag = ''

    def mfn_suffix(record):
        # identify the record by MFN in error messages, when available
        if ISIS_MFN_KEY in record:
            return ' (mfn=%s)' % record[ISIS_MFN_KEY]
        return ''

    if not mongo:
        output.write('[')
    records = iterRecords(file_name, subfields)
    try:
        for i, record in enumerate(records):
            if i >= end:
                break
            if i < start:
                continue  # skipped records produce no output at all
            if i > start and not mongo:
                output.write(',')
            output.write('\n')
            if id_tag:
                occurrences = record.get(id_tag, None)
                if occurrences is None:
                    msg = 'id tag #%s not found in record %s' % (id_tag, i)
                    raise KeyError(msg + mfn_suffix(record))
                if len(occurrences) > 1:
                    msg = ('multiple id tags #%s found in record %s'
                           % (id_tag, i))
                    raise TypeError(msg + mfn_suffix(record))
                if subfields:
                    record_id = occurrences[0]['_']
                else:
                    record_id = occurrences[0]
                if record_id in ids:
                    msg = ('duplicate id %s in tag #%s, record %s'
                           % (record_id, id_tag, i))
                    raise TypeError(msg + mfn_suffix(record))
                record['_id'] = record_id
                ids.add(record_id)
            elif gen_uuid:
                record['_id'] = unicode(uuid4())
            elif mfn:
                record['_id'] = record[ISIS_MFN_KEY]
            if tagprefix:
                # iterate over a snapshot of the keys: the dictionary is
                # modified inside the loop
                for tag in list(record):
                    if str(tag).isdigit():
                        record[tagprefix+tag] = record.pop(tag)
            if regtype:
                record[regtype_key] = regtype_value
            output.write(json.dumps(record).encode('utf-8'))
    finally:
        # let generator-based readers release their input file right away,
        # even when we stop before the end of the input
        close = getattr(records, 'close', None)
        if close is not None:
            close()
    if not mongo:
        output.write('\n]')
    output.write('\n')
191
if __name__ == '__main__':
    # Command line entry point: parse the options, pick the reader that
    # matches the input file extension and write the JSON output.

    # create the parser
    parser = argparse.ArgumentParser(
        description='Convert an ISIS .mst or .iso file to a JSON array')

    # add the arguments
    parser.add_argument(
        'file_name', metavar='INPUT.(mst|iso)', help='.mst or .iso file to read')
    parser.add_argument(
        '-o', '--out', type=argparse.FileType('w'), default=sys.stdout,
        metavar='OUTPUT.json',
        help='the file where the JSON output should be written'
             ' (default: write to stdout)')
    parser.add_argument(
        '-c', '--couch', action='store_true',
        help='output array within a "docs" item in a JSON document'
             ' for bulk insert to CouchDB via POST to db/_bulk_docs')
    parser.add_argument(
        '-m', '--mongo', action='store_true',
        help='output individual records as separate JSON dictionaries,'
             ' one per line for bulk insert to MongoDB via mongoimport utility')
    parser.add_argument(
        '-f', '--subfields', action='store_true',
        help='explode each field into a JSON dictionary, with "_" as'
             ' default key, and subfield markers as additional keys')
    parser.add_argument(
        '-q', '--qty', type=int, default=DEFAULT_QTY,
        help='maximum quantity of records to read (default=ALL)')
    parser.add_argument(
        '-s', '--skip', type=int, default=0,
        help='records to skip from start of .mst (default=0)')
    parser.add_argument(
        '-i', '--id', type=int, metavar='TAG_NUMBER', default=0,
        help='generate an "_id" from the given unique TAG field number'
             ' for each record')
    parser.add_argument(
        '-u', '--uuid', action='store_true',
        help='generate an "_id" with a random UUID for each record')
    parser.add_argument(
        '-t', '--tagprefix', type=str, metavar='PREFIX', default='',
        help='concatenate prefix to numeric field tags (ex. 99 becomes "v99")')
    parser.add_argument(
        '-n', '--mfn', action='store_true',
        help='generate an "_id" from the MFN of each record'
             ' (available only for .mst input)')
    parser.add_argument(
        '-y', '--regtype', type=str, default='',
        help='Include a field key:value for each register: -y key:value')

    # TODO: implement this to export large quantities of records to CouchDB
    # parser.add_argument(
    #     '-r', '--repeat', type=int, default=1,
    #     help='repeat operation, saving multiple JSON files'
    #          ' (default=1, use -r 0 to repeat until end of input)')

    # parse the command line
    args = parser.parse_args()
    try:
        if args.file_name.endswith('.mst'):
            iterRecords = iterMstRecords
        else:
            if args.mfn:
                # report on stderr (stdout may be the JSON destination) and
                # exit with a failure status
                sys.stderr.write('UNSUPORTED: -n/--mfn option only available'
                                 ' for .mst input.\n')
                raise SystemExit(1)
            iterRecords = iterIsoRecords
        if args.couch:
            args.out.write('{ "docs" : ')
        writeJsonArray(iterRecords, args.file_name, args.out, args.qty,
                       args.skip, args.id, args.uuid, args.mongo, args.mfn,
                       args.subfields, args.tagprefix, args.regtype)
        if args.couch:
            args.out.write('}\n')
    finally:
        # close (and flush) the output even when the conversion fails
        args.out.close()
265
Note: See TracBrowser for help on using the browser.