#!/usr/bin/env python
"""A script for generating DataCite DOI's for Mantid releases, to be called by
a Jenkins job during the release process. When given a major, minor and patch
release number along with username and password credentials, it will build a
DOI of the form "10.5286/Software/Mantid[major].[minor].[patch]", and post it
to the DataCite DOI API.
A special one-time "main" landing page DOI will be created using:
python doi.py --major=3 --minor=0 --patch=0 --username=[] --password=[] --main
Then at every release, the script will run again without the "--main" flag to
generate a DOI pointing to the release notes for that particular version.
Using the "--test" flag will run the script and post DOI's to the DataCite test
server at https://test.datacite.org/mds/doi/10.5286/Software/.
Using the "--debug" flag should print out some (hopefully) useful extra info
about what is going on under the hood.
Using the "--delete" flag will make the DOI metadata with the given details
"inactive", as well as pointing the DOI to a "DOI invalid" page.
NOTES:
- This script is tailored for use at RAL, where the internet must be accessed
via the "wwwcache.rl.ac.uk" proxy. Making it work off site will require some
tweaking.
- A requirement for the script to run is for cURL to be installed and its
executable to be on the PATH.
- The "www.mantidproject.org" domain had to be registered with DataCite (on
both the test server and the main server) before a valid DOI could be
created. This was done through the British Library, via Tom Griffin.
- Mantid DOIs will be "linked" using the relationship identifiers available in
the metadata schema. Each incremental-release DOI will be linked to the
previous DOI using the "IsNextVersionOf" field. The metadata for the
previous DOI will then have to be changed to include a "IsPreviousVersionOf"
field. Each incremental-release DOI will also be linked to the "main" Mantid
DOI via a "IsPartOf" field. The main DOI itself will have no relationship
identifiers.
USEFUL LINKS:
- The DataCite DOI API documentation can be found at:
https://mds.datacite.org/static/apidoc
- Example Python code for submitting DOI's and metadata:
https://github.com/datacite/mds/blob/master/client/python/put_doi.py
https://github.com/datacite/mds/blob/master/client/python/put_metadata.py
- HTTP status codes:
http://docs.python.org/2/library/httplib.html#httplib.HTTPS_PORT
"""
import argparse
import re
import subprocess
import xml.etree.ElementTree as ET
from datetime import date

import authors
# Successful responses from the DataCite servers appear to only come in one of
# two forms:
# - 'OK'
# - 'OK ([DOI])'
# Raw string so the backslash escapes reach the regex engine intact.
SUCCESS_RESPONSE = r'^OK( \((.+)\))?$'

# Point all "deleted" DOIs to here:
INVALID_URL = 'http://www.datacite.org/invalidDOI'
def build_xml_form(doi, relationships, creator_name_list, version_str):
    '''Builds the xml form containing the metadata for the DOI. Where helpful,
    comments showing the definition / allowed values of the data fields have
    been taken from section 2.3 of:

    http://schema.datacite.org/meta/kernel-3/doc/DataCite-MetadataKernel_v3.0.pdf

    The decision has been made to not use the optional "contributors" field,
    since creators works just as well and is mandatory anyway.

    Args:
        doi (str): the DOI the metadata describes, e.g.
            "10.5286/Software/Mantid3.0".
        relationships (dict): maps a related DOI to its relation type, e.g.
            {"10.5286/Software/Mantid": "IsPartOf"}. May be empty, in which
            case no relatedIdentifiers element is emitted.
        creator_name_list (iterable): names to credit as creators, in
            priority order.
        version_str (str or None): version string (e.g. "3.0.1"), or None to
            omit the optional version element (used for the "main" DOI).

    Returns:
        The serialised, UTF-8 encoded XML metadata document.
    '''
    # The root resource node must contain the various schema information.
    root = ET.Element('resource')
    root.set('xmlns', 'http://datacite.org/schema/kernel-3')
    root.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
    root.set('xsi:schemaLocation',
             'http://datacite.org/schema/kernel-3 ' +
             'http://schema.datacite.org/meta/kernel-3/metadata.xsd')

    # "The identifier is a unique string that identifies a resource." In our
    # case, the actual DOI. "Format should be '10.1234/foo'."
    identifier = ET.SubElement(root, 'identifier')
    identifier.text = doi
    identifier.set('identifierType', 'DOI')

    # Creators are defined as "the main researchers involved in producing the
    # data, or the authors of the publication, in priority order". Allowed
    # values are "a corporate/institutional or personal name".
    #
    # Use all authors up to and including the version tag if creating the
    # "main" DOI, else only use authors who contributed to that version.
    creators = ET.SubElement(root, 'creators')
    for creator_name in creator_name_list:
        creator = ET.SubElement(creators, 'creator')
        ET.SubElement(creator, 'creatorName').text = creator_name

    # Titles are defined as a "name or title by which a resource is known".
    title_text_list = [
        'Mantid: Manipulation and Analysis Toolkit for Instrument Data.'
    ]
    titles = ET.SubElement(root, 'titles')
    for title_text in title_text_list:
        ET.SubElement(titles, 'title').text = title_text

    # "The name of the entity that holds, archives, publishes, prints,
    # distributes, releases, issues, or produces the resource. This property
    # will be used to formulate the citation, so consider the prominence of
    # the role."
    ET.SubElement(root, 'publisher').text = 'Mantid Project'

    # "The year when the data was or will be made publicly available."
    ET.SubElement(root, 'publicationYear').text = str(date.today().year)

    # "Subject, keyword, classification code, or key phrase describing the
    # resource."
    subject_text_list = [
        'Neutron Scattering',
        'Muon Spin Resonance',
        'Data Analysis'
    ]
    subjects = ET.SubElement(root, 'subjects')
    for subject_text in subject_text_list:
        ET.SubElement(subjects, 'subject').text = subject_text

    # "The primary language of the resource."
    ET.SubElement(root, 'language').text = 'en'

    # "A description of the resource. The format is open, but the
    # preferred format is a single term of some detail so that a pair can be
    # formed with the sub-property." Just using the general type "software"
    # seems good enough for our purposes.
    resource_type = ET.SubElement(root, 'resourceType')
    resource_type.text = ''
    resource_type.set('resourceTypeGeneral', 'Software')

    # "The version number of the resource." Suggested practice is to "register
    # a new identifier for a major version change." We'll be ignoring this
    # as we're having a new DOI for every major/minor/patch release.
    if version_str:
        ET.SubElement(root, 'version').text = version_str

    # "Identifiers of related resources. These must be globally unique
    # identifiers."
    if relationships:
        related_identifiers = ET.SubElement(root, 'relatedIdentifiers')
        # NOTE: the loop variable is deliberately NOT called "doi" so that it
        # does not shadow the identifier parameter above.
        for related_doi, relation_type in relationships.items():
            related_identifier = ET.SubElement(
                related_identifiers, 'relatedIdentifier'
            )
            related_identifier.text = related_doi
            related_identifier.set('relatedIdentifierType', 'DOI')
            related_identifier.set('relationType', relation_type)

    # "Provide a rights management statement for the resource or reference a
    # service providing such information. Include embargo information if
    # applicable. Use the complete title of a license and include version
    # information if applicable."
    rights_list = ET.SubElement(root, 'rightsList')
    rights = ET.SubElement(rights_list, 'rights')
    rights.text = 'GNU General Public Release (Version 3)'
    rights.set('rightsURI', 'http://www.gnu.org/licenses/gpl.html')

    # "All additional information that does not fit in any of the other
    # categories. May be used for technical information."
    descriptions = ET.SubElement(root, 'descriptions')
    description = ET.SubElement(descriptions, 'description')
    description.text = 'Mantid: A high performance framework for the ' + \
                       'reduction and analysis of muon spin resonance and ' + \
                       'neutron scattering data.'
    description.set('descriptionType', 'Abstract')

    return ET.tostring(root, encoding='utf-8')
def _http_request(body, method, url, options):
    '''Issue an HTTP request with the given options by shelling out to cURL.

    We are forced to use a command line tool for this rather than use the
    in-built Python libraries since httplib, urllib and urllib2 all seem to
    have problems using HTTPS through the proxy at RAL. HTTP works fine,
    but the DOI API is encrypted so that is not an option.

    We prefer cURL to wget since it exists on many Linux machines and even
    comes bundled with Git Bash for Windows! Some good info on scripting
    with cURL can be found at:

    http://curl.haxx.se/docs/httpscripting.html

    Args:
        body (str): the request body (may be empty).
        method (str): the HTTP verb, e.g. 'GET', 'PUT' or 'DELETE'.
        url (str): the full URL to send the request to.
        options: the parsed command line options; username, password and
            debug are read here.

    Returns:
        list: the lines of output captured from cURL's stdout.
    '''
    # Set how loud cURL should be while running.
    verbosity_flag = '--verbose' if options.debug else '--silent'
    cmd = [
        'curl',
        '--user', options.username + ':' + options.password,
        '--proxy', 'wwwcache.rl.ac.uk:8080',
        '--header', 'Content-Type:text/plain;charset=UTF-8',
        # The bodies of HTTP messages must be encoded:
        '--data', body.encode('utf-8'),
        '--request', method,
        verbosity_flag,
        url,
    ]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    result = proc.stdout.readlines()
    print("Server Response: " + str(result))
    return result
def delete_doi(base, doi, options):
    '''Will attempt to delete the given DOI. Note that this does not actually
    remove the DOI from the DataCite servers; it makes its metadata "inactive"
    and points the DOI to a "DOI invalid" page.

    Args:
        base (str): the server URL base, e.g. 'https://mds.datacite.org/'.
        doi (str): the DOI to make inactive.
        options: the parsed command line options (credentials, debug flag).

    Raises:
        Exception: if either server call does not return a success response.
    '''
    print("\nAttempting to delete the meta data for:" + doi)
    result = _http_request(
        body='',
        method='DELETE',
        url=base + "metadata/" + doi,
        options=options
    )
    if not re.match(SUCCESS_RESPONSE, result[0]):
        raise Exception('Deleting metadata unsuccessful. Quitting.')

    print("\nAttempting to point " + doi + " to invalid page.")
    result = _http_request(
        body='doi=' + doi + '\n' + 'url=' + INVALID_URL,
        method="PUT",
        url=base + "doi/" + doi,
        options=options
    )
    if not re.match(SUCCESS_RESPONSE, result[0]):
        raise Exception('Pointing DOI to invalid page was unsuccessful.')
def create_or_update_metadata(xml_form, base, doi, options):
    '''Attempts to create some new metadata for the doi of the given address.
    Metadata must be created before a doi can be created. If the metadata
    already exists, then it will simply be updated.

    Args:
        xml_form (str): the serialised XML metadata (see build_xml_form).
        base (str): the server URL base, e.g. 'https://mds.datacite.org/'.
        doi (str): the DOI the metadata belongs to.
        options: the parsed command line options (credentials, debug flag).

    Raises:
        Exception: if the server does not return a success response.
    '''
    print("\nAttempting to create / update metadata:")
    result = _http_request(
        body=xml_form,
        method="PUT",
        url=base + "metadata/" + doi,
        options=options
    )
    if not re.match(SUCCESS_RESPONSE, result[0]):
        raise Exception('Creation/updating metadata unsuccessful. Quitting.')
def create_or_update_doi(base, doi, destination, options):
    '''Attempts to create a new DOI of the given address. Metadata must be
    created before this can be successful. If the doi already exists, then it
    will simply be updated.

    Args:
        base (str): the server URL base, e.g. 'https://mds.datacite.org/'.
        doi (str): the DOI to create or update.
        destination (str): the URL the DOI should resolve to.
        options: the parsed command line options (credentials, debug flag).

    Raises:
        Exception: if the server does not return a success response.
    '''
    print("\nAttempting to create / update the following DOI:")
    print('DOI = ' + doi)
    print('URL = ' + destination)
    result = _http_request(
        body='doi=' + doi + '\n' + 'url=' + destination,
        method="PUT",
        url=base + "doi/" + doi,
        options=options
    )
    if not re.match(SUCCESS_RESPONSE, result[0]):
        raise Exception('Creation/updating DOI unsuccessful. Quitting.')
def check_if_doi_exists(base, doi, destination, options):
    '''Attempts to check if the given doi exists by querying the server and
    seeing if what comes back is the expected DOI destination. Returns True
    if a doi is found (and the destination returned by the server is the same
    as the given destination), else false. Throws if the response from the
    server is unrecognised, or if there is no response at all.

    Args:
        base (str): the server URL base, e.g. 'https://mds.datacite.org/'.
        doi (str): the DOI to look up.
        destination (str): the URL the DOI is expected to resolve to.
        options: the parsed command line options (credentials, debug flag).

    Returns:
        bool: True if the DOI exists and points at destination, else False.
    '''
    print("\nChecking if \"" + base + "doi/" + doi + "\" DOI exists.")
    server_reply = _http_request(
        body='',
        method='GET',
        url=base + "doi/" + doi,
        options=options
    )
    reply_line = server_reply[0]

    # The server answers with a "not found" message (or the invalid-DOI
    # page for deleted DOIs) when there is no live DOI at this address.
    if reply_line in ('DOI not found', INVALID_URL):
        print("\"" + doi + "\" does not exist")
        return False

    if reply_line == destination:
        print("DOI found.")
        return True

    raise Exception(
        "Unexpected result back from server: \"" + reply_line + "\"")
def check_for_curl():
    '''A check to see whether we can call cURL on the command line.

    Raises:
        Exception: if the "curl" executable could not be run successfully.
    '''
    # See if a call to 'curl --version' gives us a successful return code,
    # else raise an exception.
    try:
        proc = subprocess.Popen(['curl', '--version'], stdout=subprocess.PIPE)
        proc.wait()
        found = proc.returncode == 0
    except OSError:
        # Popen raises OSError when the executable cannot be found at all.
        found = False

    if not found:
        raise Exception('This script requires that cURL be installed and ' + \
                        'available on the PATH.')
def run(options):
    '''Creating a usable DOI is (for our purposes at least) a two step
    process: metadata has to be constructed and then sent to the server, and
    then the DOI itself has to be sent once the metadata is in place.

    If pre-existing DOI's or metadata are submitted then they will overwrite
    what was there previously.

    We also have to amend the metadata for the previous DOI (if one exists),
    so that we can set up a IsPreviousVersionOf/IsNewVersionOf relationship
    between the two DOIs.

    Args:
        options: the parsed command line options (see the argparse setup).
    '''
    # Get the git tag and "version string" of this version as well as the
    # version before it if this is an incremental release.
    version = options.major, options.minor, options.patch
    version_str = authors.get_version_string(options.major,
                                             options.minor,
                                             options.patch)
    tag = authors.find_tag(*version)
    if not options.main:
        prev_tag = authors.get_previous_tag(tag)
        prev_version = authors.get_version_from_git_tag(prev_tag)
        prev_version_str = authors.get_version_string(*prev_version)

    main_doi = '10.5286/Software/Mantid'
    if options.main:  # The one-off, main landing page DOI.
        doi = main_doi
        prev_doi = ''
        has_previous_version = False
    else:  # Incremental release DOI.
        prev_doi = '10.5286/Software/Mantid' + prev_version_str
        doi = '10.5286/Software/Mantid' + version_str

    if options.main:
        destination = 'http://www.mantidproject.org'
    else:
        destination = 'http://www.mantidproject.org/Release_Notes_' + \
            version_str
        prev_destination = 'http://www.mantidproject.org/Release_Notes_' + \
            prev_version_str

    # Use the test server if running in test mode.
    if options.test:
        server_url_base = 'https://test.datacite.org/mds/'
    else:
        server_url_base = 'https://mds.datacite.org/'

    if options.delete:
        delete_doi(server_url_base, doi, options)
        # Once the DOI has been made inactive there is nothing more to do;
        # carrying on would immediately recreate the metadata / DOI below.
        return

    if options.main:
        # If the user ran this script with the --main flag, then all we need
        # to do is create a single, unlinked DOI to the main project page.
        creator_name_list = authors.authors_up_to_git_tag(tag)
        # In the case of the main DOI we need to add the whitelisted names
        # too.
        creator_name_list = sorted(set(creator_name_list + authors.whitelist))
        xml_form = build_xml_form(doi, {}, creator_name_list, None)
        create_or_update_metadata(xml_form, server_url_base, doi, options)
        create_or_update_doi(server_url_base, doi, destination, options)
    else:
        # Else it's an incremental-release DOI that we need to make.
        has_previous_version = check_if_doi_exists(
            server_url_base,
            prev_doi,
            prev_destination,
            options
        )
        relationships = {main_doi: 'IsPartOf'}
        if has_previous_version:
            relationships[prev_doi] = 'IsNewVersionOf'

        creator_name_list = authors.authors_under_git_tag(tag)
        xml_form = build_xml_form(
            doi,
            relationships,
            creator_name_list,
            version_str
        )

        # Create/update the metadata and DOI.
        create_or_update_metadata(xml_form, server_url_base, doi, options)
        create_or_update_doi(server_url_base, doi, destination, options)

        # Create/update the metadata and DOI of the previous version, if it
        # was found to have a DOI.
        if has_previous_version:
            prev_relationships = {
                main_doi: 'IsPartOf',
                doi: 'IsPreviousVersionOf'
            }
            prev_creator_name_list = authors.authors_under_git_tag(prev_tag)
            prev_xml_form = build_xml_form(
                prev_doi,
                prev_relationships,
                prev_creator_name_list,
                prev_version_str
            )
            create_or_update_metadata(
                prev_xml_form,
                server_url_base,
                prev_doi,
                options
            )

    # Print out a custom success message, depending on the initial options.
    if not options.test:
        method = "resolved"
        doi_add = 'http://dx.doi.org/' + doi
        meta_add = 'https://mds.datacite.org/metadata/' + doi
        prev_meta_add = 'https://mds.datacite.org/metadata/' + prev_doi
    else:
        method = "inspected"
        doi_add = 'https://test.datacite.org/mds/doi/' + doi
        meta_add = 'https://test.datacite.org/mds/metadata/' + doi
        prev_meta_add = 'https://test.datacite.org/mds/metadata/' + prev_doi

    if has_previous_version:
        message = "\nSUCCESS!" + \
            "\nThe DOI can be %s at \"%s\"." % (method, doi_add) + \
            "\nThe new metadata can be inspected at \"%s\"." % (meta_add) + \
            "\nThe previous version's metadata can be inspected at " + \
            "\"%s\"." % (prev_meta_add)
    else:
        message = "\nSUCCESS!" + \
            "\nThe DOI can be %s at \"%s\"." % (method, doi_add) + \
            "\nThe metadata can be inspected at \"%s\"." % (meta_add)

    print(message)
if __name__ == "__main__":
    # Parse the version numbers, credentials and mode flags, then hand off
    # to run() which does all the real work.
    parser = argparse.ArgumentParser(
        description="Script to generate the DOI needed for a Mantid release."
    )

    # REQUIRED
    parser.add_argument(
        '--major',
        type=int,
        required=True,
        help='The major version number of this Mantid release.'
    )
    parser.add_argument(
        '--minor',
        type=int,
        required=True,
        help='The minor version number of this Mantid release.'
    )
    parser.add_argument(
        '--patch',
        type=int,
        required=True,
        help='The patch version number of this Mantid release. Note: this' + \
             ' is NOT the SHA1 or the commit number from Git. The patch ' + \
             'number for v3.0 (or v3.0.0) is 0, and the patch number for ' + \
             'v3.0.1 is 1.'
    )
    parser.add_argument(
        '--password',
        type=str,
        required=True,
        help='Password. This should be hidden in the Jenkins\' job logs.'
    )
    parser.add_argument(
        '--username',
        type=str,
        required=True,
        help='Username to access DOI API.'
    )

    # OPTIONAL
    parser.add_argument(
        '--test',
        action='store_true',
        help='Send submissions to the test server to trial run the script.'
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Turn debug mode on. Basically, makes cURL more talkative.'
    )
    parser.add_argument(
        '--main',
        action='store_true',
        help='Create the "main" DOI for Mantid. Once it is created, this ' + \
             'will only have to run again if it needs to be updated.'
    )
    parser.add_argument(
        '--delete',
        action='store_true',
        help='Delete ("make inactive") the DOI metadata with the given ' + \
             'details. Note that this does NOT delete the DOI.'
    )

    run(parser.parse_args())