#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
becas-python - becas API client for Python
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

**becas-python** is the official Python client for the becas API.

**becas** is a biomedical concept annotator available through an HTTP API.

This package allows usage of the becas API from a command-line tool or
programmatically from Python modules.

Install it with::

    $ pip install becas

And learn to use it by reading the `documentation`_.

:copyright: (c) 2014, Tiago Nunes, Universidade de Aveiro
:license: Creative Commons Attribution-Noncommercial

Resources
^^^^^^^^^

* `Documentation <http://tnunes.github.io/becas-python/>`_
* Issue Tracker
* Code
* becas API documentation
* About becas
'''
# NOTE(review): the hyperlink targets of the resource list above were missing
# from the reviewed source. Only the documentation link could be restored
# (from ``__url__``); the remaining targets need to be filled in again.

__title__ = 'becas'
__version__ = '1.0.4-dev'
__author__ = 'Tiago Nunes'
__license__ = 'CC-BY-NC'
__copyright__ = 'Copyright 2014, Tiago Nunes, Universidade de Aveiro'
__url__ = 'http://tnunes.github.io/becas-python/'
__maintainer__ = 'Tiago Nunes'
__email__ = 'tiago.nunes@ua.pt'

__all__ = ('email', 'tool', 'timeout', 'secure',
           'SEMANTIC_GROUPS', 'EXPORT_FORMATS',
           'annotate_text', 'export_text',
           'annotate_publication', 'export_publication',
           'main',
           'BecasException', 'AuthenticationRequired', 'InvalidGroups',
           'InvalidFormat', 'TooMuchText', 'TooManyRequests',
           'PublicationNotFound', 'ServiceUnavailable', 'ConnectionError',
           'SSLError', 'Timeout',)

import sys
import time
import json

try:
    from urllib.parse import quote
except ImportError:
    from urllib import quote  # NOQA

import requests  # urllib2 sucks badly, we depend on requests


# -- Configuration parameters - you can set these from your modules -----------

#: becas API authentication ``email`` parameter
email = None

#: becas API authentication ``tool`` parameter
tool = 'becas-python'

#: Seconds to wait before timing out a request
timeout = 120

#: Whether to use HTTPS or plain HTTP
secure = False


# -- Internal constants - do not touch these ----------------------------------

#: Semantic groups usable as keys of a ``groups`` :class:`dict`
SEMANTIC_GROUPS = ('SPEC', 'ANAT', 'DISO', 'PATH', 'CHED', 'ENZY', 'MRNA',
                   'PRGE', 'COMP', 'FUNC', 'PROC',)

#: Output formats available for the :func:`export_text` function
EXPORT_FORMATS = ('json', 'xml', 'a1', 'conll',)

_ENDPOINTS_PREFIX = 'bioinformatics.ua.pt/becas/api/'
_TEXT_ANNOTATE_ENDPOINT = _ENDPOINTS_PREFIX + 'text/annotate'
_TEXT_EXPORT_ENDPOINT = _ENDPOINTS_PREFIX + 'text/export'
_PUBMED_ANNOTATE_ENDPOINT = _ENDPOINTS_PREFIX + 'pubmed/annotate/'  # + PMID
_PUBMED_EXPORT_ENDPOINT = _ENDPOINTS_PREFIX + 'pubmed/export/'  # + PMID

_DEFAULT_HEADERS = {
    'User-Agent': 'becas-python/%s %s' % (
        __version__, requests.utils.default_user_agent()),
    'Content-Type': 'application/json',
}


# -- Exceptions ---------------------------------------------------------------

class BecasException(RuntimeError):
    '''There was an ambiguous exception that occurred while handling your
    request.'''


class AuthenticationRequired(BecasException):
    '''You need to authenticate your requests with your email and optionally
    a tool name.

    As an example, if your email address is you@example.com, you can specify
    it as follows::

        >>> import becas
        >>> becas.email = 'you@example.com'

    In case of excessive usage of the becas API, UA.PT Bioinformatics will
    attempt to contact a user at the email address provided before blocking
    access to the services.
    '''


class InvalidGroups(BecasException, ValueError):
    '''You provided an invalid groups dictionary.'''


class InvalidFormat(BecasException, ValueError):
    '''You provided an invalid export format.'''


class TooMuchText(BecasException, ValueError):
    '''You provided too much text to annotate. Try again with less text.'''


class TooManyRequests(BecasException):
    '''You performed too many requests in a short amount of time.
    Wait ``wait`` seconds before trying again.'''

    def __init__(self, wait):
        # Initialise the base exception with a message so ``str(exc)`` (used
        # by the command-line error output) is not empty.
        super(TooManyRequests, self).__init__(
            'Too many requests. Wait %s seconds before trying again' % wait)
        #: Value of the ``Retry-After`` response header (``None`` if absent)
        self.wait = wait


class PublicationNotFound(BecasException, ValueError):
    '''The publication you requested was not found in PubMed.'''


class ServiceUnavailable(BecasException):
    '''The annotation service is currently unavailable.'''


class Timeout(BecasException):
    '''The request timed out.'''


class ConnectionError(BecasException):
    '''A Connection error occurred.'''


class SSLError(ConnectionError):
    '''An SSL error occurred.'''


# -- API methods --------------------------------------------------------------

def annotate_text(text, groups=None, echo=False):
    '''Annotate text with biomedical concepts.

    :param text: text to annotate (:class:`str` or :class:`unicode`).
    :param groups: *optional* :class:`dict` of concept groups to identify.
    :param echo: *optional* flag to return ``text`` in the response.
    :return: :class:`dict` with annotation results.
    :raises ValueError: if ``text`` is empty or ``groups`` is invalid.
    :raises BecasException: on authentication, network or service errors.

    Usage::

        >>> import becas
        >>> becas.email = 'you@example.com'
        >>> results = becas.annotate_text('BRCA1 is a human caretaker gene.')
    '''
    _validate_text(text)
    payload = {'text': text}
    if groups:
        _validate_groups(groups)
        payload['groups'] = groups
    if echo:
        payload['echo'] = True
    _validate_authentication()
    endpoint = _endpoint_url('annotate_text')
    response = _do_request(endpoint, payload)
    return response.json()


def export_text(text, format, groups=None):
    '''Export text annotated with biomedical concepts in JSON, XML, A1 or
    CONLL.

    :param text: text to annotate (:class:`str` or :class:`unicode`).
    :param format: output format (one of 'json', 'xml', 'a1' or 'conll').
    :param groups: *optional* :class:`dict` of concept groups to identify.
    :return: :class:`unicode` string with annotation results.
    :raises ValueError: if ``text``, ``format`` or ``groups`` is invalid.
    :raises BecasException: on authentication, network or service errors.

    Usage::

        >>> import becas
        >>> becas.email = 'you@example.com'
        >>> text = 'BRCA1 is a human caretaker gene.'
        >>> json_results = becas.export_text(text, 'json')
        >>> iexml_results = becas.export_text(text, 'xml')
        >>> a1_results = becas.export_text(text, 'a1')
        >>> conll_results = becas.export_text(text, 'conll')
    '''
    _validate_text(text)
    _validate_format(format)
    payload = {'text': text, 'format': format}
    if groups:
        _validate_groups(groups)
        payload['groups'] = groups
    _validate_authentication()
    endpoint = _endpoint_url('export_text')
    response = _do_request(endpoint, payload)
    return response.text


def annotate_publication(pmid, groups=None):
    '''Annotate PubMed publication with biomedical concepts.

    :param pmid: PMID of publication to annotate.
    :param groups: *optional* :class:`dict` of concept groups to identify.
    :return: :class:`dict` with annotation results.
    :raises ValueError: if ``pmid`` or ``groups`` is invalid.
    :raises BecasException: on authentication, network or service errors.

    Usage::

        >>> import becas
        >>> becas.email = 'you@example.com'
        >>> results = becas.annotate_publication(23225384)
    '''
    _validate_pmid(pmid)
    payload = {}
    if groups:
        _validate_groups(groups)
        payload['groups'] = groups
    _validate_authentication()
    endpoint = _endpoint_url('annotate_publication', pmid=pmid)
    response = _do_request(endpoint, payload)
    return response.json()


def export_publication(pmid, groups=None):
    '''Export PubMed publication as MEDLINE IeXML annotated with biomedical
    concepts.

    :param pmid: PMID of publication to annotate.
    :param groups: *optional* :class:`dict` of concept groups to identify.
    :return: :class:`unicode` string with IeXML annotation results.
    :raises ValueError: if ``pmid`` or ``groups`` is invalid.
    :raises BecasException: on authentication, network or service errors.

    Usage::

        >>> import becas
        >>> becas.email = 'you@example.com'
        >>> results = becas.export_publication(23225384)
    '''
    _validate_pmid(pmid)
    payload = {}
    if groups:
        _validate_groups(groups)
        payload['groups'] = groups
    _validate_authentication()
    endpoint = _endpoint_url('export_publication', pmid=pmid)
    response = _do_request(endpoint, payload)
    return response.text


# -- Helpers ------------------------------------------------------------------

def _endpoint_url(endpoint, pmid=None):
    '''Return service URL for given endpoint.

    :param endpoint: one of 'annotate_text', 'export_text',
        'annotate_publication' or 'export_publication'.
    :param pmid: PMID appended to the publication endpoints.
    :raises ValueError: if ``endpoint`` is not a known endpoint name.
    '''
    scheme = 'https://' if secure else 'http://'
    auth = '?tool=' + quote(tool) + '&email=' + quote(email)
    if endpoint == 'annotate_text':
        return scheme + _TEXT_ANNOTATE_ENDPOINT + auth
    if endpoint == 'export_text':
        return scheme + _TEXT_EXPORT_ENDPOINT + auth
    if endpoint == 'annotate_publication':
        return scheme + _PUBMED_ANNOTATE_ENDPOINT + str(pmid) + auth
    if endpoint == 'export_publication':
        return scheme + _PUBMED_EXPORT_ENDPOINT + str(pmid) + auth
    raise ValueError('Unknown endpoint "%s"' % endpoint)


def _validate_authentication():
    '''Ensure the user has authenticated itself by providing an email address
    and tool name.

    :raises AuthenticationRequired: if ``email`` or ``tool`` is unset/blank.
    '''
    if not email or not email.strip():
        raise AuthenticationRequired('Please set your email')
    if not tool or not tool.strip():
        raise AuthenticationRequired('Please set your tool name')


def _validate_text(text):
    '''Validate text to annotate.

    :raises ValueError: if ``text`` is empty or only whitespace.
    '''
    if not text or not text.strip():
        raise ValueError('Invalid ``text`` parameter')


def _validate_pmid(pmid):
    '''Validate PMID to annotate.

    :raises ValueError: if ``pmid`` is not a positive integer.
    '''
    # ``bool`` is a subclass of ``int``; reject it explicitly so ``True`` is
    # not turned into a ".../True" request URL.
    if isinstance(pmid, bool) or not isinstance(pmid, int) or pmid <= 0:
        raise ValueError('Invalid ``pmid`` parameter')


def _validate_groups(groups):
    '''Validate semantic groups.

    :param groups: :class:`dict` mapping names from :data:`SEMANTIC_GROUPS`
        to booleans; at least one value must be ``True``.
    :raises InvalidGroups: if ``groups`` does not satisfy the above.
    '''
    if not isinstance(groups, dict):
        raise InvalidGroups('If specified, ``groups`` must be a dictionary')
    valid = False
    for group, value in groups.items():
        if group not in SEMANTIC_GROUPS:
            raise InvalidGroups('Unknown group ``%s``' % group)
        if not isinstance(value, bool):
            raise InvalidGroups(
                'Invalid value ``%s`` for group ``%s``. Must be boolean'
                % (value, group))
        if value:
            valid = True
    if not valid:  # no "true" groups for annotation
        raise InvalidGroups('No ``groups`` selected for annotation.'
                            ' At least one group must be true')


def _validate_format(format):
    '''Validate export format.

    :raises InvalidFormat: if ``format`` is not in :data:`EXPORT_FORMATS`.
    '''
    if format not in EXPORT_FORMATS:
        raise InvalidFormat('Unknown format ``%s``' % format)


def _error_message(res, fallback):
    '''Return the ``error`` field of a JSON error response.

    Falls back to ``str(fallback)`` when the body is not JSON or has no
    ``error`` key, so the original HTTP error is never masked by a parsing
    failure.
    '''
    try:
        return res.json()['error']
    except (ValueError, KeyError, TypeError):
        return str(fallback)


def _do_request(endpoint, payload):
    '''Perform a POST request to one of the becas API endpoints.

    :param endpoint: full URL of the endpoint.
    :param payload: JSON-serializable request body.
    :return: the successful ``requests`` response object.
    :raises Timeout, SSLError, ConnectionError: on transport failures.
    :raises PublicationNotFound, TooMuchText, TooManyRequests,
        ServiceUnavailable, BecasException: on HTTP error statuses.
    '''
    # Throttle requests to perform at most two per second
    delay = 0.5
    current = time.time()
    wait = _do_request._previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _do_request._previous = current + wait
    else:
        _do_request._previous = current

    try:
        # NOTE(review): ``verify=False`` disables TLS certificate validation
        # (kept for backward compatibility: validation fails in systems
        # without proper CAs installed). This weakens ``secure = True``;
        # consider making it configurable.
        res = requests.post(endpoint, data=json.dumps(payload),
                            headers=_DEFAULT_HEADERS, timeout=timeout,
                            verify=False)
    except requests.exceptions.Timeout as e:
        raise Timeout(e)
    except requests.exceptions.SSLError as e:
        # Must precede ConnectionError: it is handled as a more specific case.
        raise SSLError(e)
    except requests.exceptions.ConnectionError as e:
        raise ConnectionError(e)
    except Exception as e:
        raise BecasException(e)

    try:
        res.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if res.status_code == 404:
            raise PublicationNotFound(_error_message(res, e))
        if res.status_code == 413:
            raise TooMuchText(_error_message(res, e))
        if res.status_code == 429:
            # The header may be missing; do not fail with KeyError then.
            raise TooManyRequests(wait=res.headers.get('Retry-After'))
        if res.status_code == 502:
            raise ServiceUnavailable(str(e))
        if res.status_code == 503:
            raise ServiceUnavailable(_error_message(res, e))
        raise BecasException(e)
    return res
_do_request._previous = 0  # time of last request


# -- Command line interface ---------------------------------------------------

def _argparser():
    '''Return ArgumentParser to parse command-line options.'''
    import argparse
    description = 'Annotate text or PubMed publications using the becas API.'
    ap = argparse.ArgumentParser(description=description)

    # API method selection
    subparsers = ap.add_subparsers(dest='command')
    # On Python 3 sub-commands are optional by default, which would let
    # ``main`` fail with AttributeError on ``args.func``. Setting the
    # attribute (instead of the ``required=`` keyword) also works on
    # Python versions whose ``add_subparsers`` lacks that keyword.
    subparsers.required = True

    # Text methods
    text_annotate_parser = subparsers.add_parser(
        'annotate-text',
        help='annotate text as JSON with concept metadata',
        description=('Annotate text with biomedical concepts using the '
                     'becas API.'))
    text_annotate_parser.set_defaults(func=_cli_annotate_text)
    text_export_parser = subparsers.add_parser(
        'export-text',
        help='export text in a chosen format',
        description=('Export text annotated with biomedical concepts in a '
                     ' chosen format using the becas API.'))
    text_export_parser.set_defaults(func=_cli_export_text)
    for text_parser in (text_annotate_parser, text_export_parser):
        _add_auth_options(text_parser)
        input_group = text_parser.add_argument_group('input selection')
        text_input = input_group.add_mutually_exclusive_group(required=True)
        text_input.add_argument('-f', '--file',
                                type=argparse.FileType('rt'), dest='file',
                                metavar='FILE', help='text file to annotate')
        text_input.add_argument('-t', '--text', dest='text', metavar='TEXT',
                                help='plain text to annotate')
        text_input.add_argument('-i', '--stdin', action='store_true',
                                dest='stdin', help='read text from STDIN')
    output_group = text_export_parser.add_argument_group('output selection')
    output_group.add_argument('--format', required=True, dest='format',
                              choices=EXPORT_FORMATS, help='output format')

    # Publication methods
    publication_annotate_parser = subparsers.add_parser(
        'annotate-publication',
        help='annotate PubMed publication as JSON with concept metadata',
        description=('Annotate PubMed publications with biomedical concepts '
                     'using the becas API.'))
    publication_annotate_parser.set_defaults(func=_cli_annotate_publication)
    publication_export_parser = subparsers.add_parser(
        'export-publication',
        help='export PubMed publication in MEDLINE IeXML',
        description=('Export PubMed publications annotated with biomedical '
                     'concepts using the becas API.'))
    publication_export_parser.set_defaults(func=_cli_export_publication)
    for publication_parser in (
            publication_annotate_parser, publication_export_parser):
        _add_auth_options(publication_parser)
        input_group = publication_parser.add_argument_group('input selection')
        input_group.add_argument('-p', '--pmid', type=int, required=True,
                                 dest='pmid', metavar='PMID',
                                 help='PMID of publication to annotate')

    for parser in (text_annotate_parser, text_export_parser,
                   publication_annotate_parser, publication_export_parser):
        _add_common_options(parser)
    return ap


def _add_auth_options(parser):
    '''Add API authentication options to a ArgumentParser.'''
    auth_group = parser.add_argument_group('client authentication')
    auth_group.add_argument('--email', dest='email', required=True,
                            help='Email address to use in API authentication')
    auth_group.add_argument('--tool', dest='tool', default=tool,
                            help=('Tool name to use in API authentication '
                                  '(default: %s)' % tool))


def _add_common_options(parser):
    '''Add common API options to a ArgumentParser.'''
    import argparse
    parser.add_argument('-g', '--groups', dest='groups',
                        help=('semantic groups to use for annotation as a '
                              'comma separated list (e.g. PRGE,DISO,ANAT). '
                              'Available groups: (%s)'
                              % ', '.join(SEMANTIC_GROUPS)))
    parser.add_argument('-o', '--output-file',
                        type=argparse.FileType('wt'), dest='output_file',
                        metavar='FILE',
                        help='file to save annotation results to')
    parser.add_argument('--secure', action='store_true', dest='secure',
                        default=secure,
                        help='access the service securely through HTTPS')
    parser.add_argument('--timeout', type=int, dest='timeout',
                        default=timeout,
                        help='seconds to wait before timing out a request')


def _setup_common_cli_args(args):
    '''Validate and set common command-line arguments.

    Copies the authentication and connection options from ``args`` into the
    module-level configuration and returns the ``groups`` dictionary built
    from ``--groups`` (``None`` when the option was not given).
    '''
    global email, tool, timeout, secure
    email = args.email
    tool = args.tool
    timeout = args.timeout
    secure = args.secure
    groups = None
    if args.groups:
        # Tolerate whitespace around names ("PRGE, DISO") and stray commas.
        names = (name.strip() for name in args.groups.split(','))
        groups = dict((name, True) for name in names if name)
        try:
            _validate_groups(groups)
        except InvalidGroups as e:
            _argparser().error(str(e))
    return groups


def _validate_cli_text(text, err_msg):
    '''Validate text input from command-line.

    Exits through the argument parser's ``error`` with ``err_msg`` when the
    text is invalid.
    '''
    try:
        _validate_text(text)
    except ValueError:
        _argparser().error(err_msg)


def _get_cli_text(args):
    '''Read text from the chosen input medium.'''
    if args.stdin:
        text = sys.stdin.read()
        _validate_cli_text(text, 'Got no text from STDIN')
    elif args.file:
        err_msg = '`%s` file is empty or non-textual' % args.file.name
        try:
            text = args.file.read()
        except UnicodeDecodeError:
            # Undecodable (non-textual) input: report it like an empty file
            # instead of letting a traceback escape.
            _argparser().error(err_msg)
        finally:
            args.file.close()
        _validate_cli_text(text, err_msg)
    else:
        text = args.text
        _validate_cli_text(text, 'Got empty --text argument')
    return text


def _handle_annotation_results(results, output_file):
    '''Print annotation results to STDOUT or to a file.

    :param results: :class:`dict` (serialized as JSON) or text to output.
    :param output_file: open text-mode file, or ``None`` to use STDOUT.
        The file is always closed before returning.
    '''
    results = json.dumps(results) if isinstance(results, dict) else results
    if output_file:
        # The file is opened in text mode: Python 3 needs ``str`` (writing
        # bytes raises TypeError) while Python 2 needs encoded bytes.
        if sys.version_info[0] < 3:
            results = results.encode('utf-8')
        try:
            output_file.write(results)
        except IOError as e:
            _abort('IOError writing to output file: %s' % e)
        finally:
            output_file.close()
    else:
        sys.stdout.write(results)
        sys.stdout.flush()


def _cli_call(func, *func_args):
    '''Call an API function, turning its errors into command-line exits.

    Invalid arguments (``ValueError``) are reported through the argument
    parser; any other :class:`BecasException` aborts the program. Both paths
    exit the process, so a value is only returned on success.
    '''
    try:
        return func(*func_args)
    except ValueError as e:
        _argparser().error(str(e))
    except BecasException as e:
        _abort(e)


def _cli_annotate_text(args):
    '''Annotate text from the command-line.'''
    groups = _setup_common_cli_args(args)
    text = _get_cli_text(args)
    results = _cli_call(annotate_text, text, groups)
    _handle_annotation_results(results, args.output_file)


def _cli_export_text(args):
    '''Export annotated text from the command-line.'''
    groups = _setup_common_cli_args(args)
    text = _get_cli_text(args)
    results = _cli_call(export_text, text, args.format, groups)
    _handle_annotation_results(results, args.output_file)


def _cli_annotate_publication(args):
    '''Annotate PubMed publication from the command-line.'''
    groups = _setup_common_cli_args(args)
    results = _cli_call(annotate_publication, args.pmid, groups)
    _handle_annotation_results(results, args.output_file)


def _cli_export_publication(args):
    '''Export annotated PubMed publication from the command-line.'''
    groups = _setup_common_cli_args(args)
    results = _cli_call(export_publication, args.pmid, groups)
    _handle_annotation_results(results, args.output_file)


def main():
    '''Command-line interface entry point.'''
    args = _argparser().parse_args()
    args.func(args)


def _abort(msg, ret=1):
    '''Print message to stderr and abort program execution.

    :param msg: message string or exception to report.
    :param ret: process exit status.
    '''
    if isinstance(msg, Exception):
        msg = '%s: %s' % (type(msg).__name__, msg)
    sys.stderr.write(msg + '\n')
    sys.exit(ret)


if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        _abort('Manually interrupted by ^C. Aborting.')