forked from ISTI-ansible-roles/ansible-roles
ckan -> library/roles/ckan
postgresql_extensions -> library/roles/postgresql_extensions d4science-ghn-cluster: The ganglia, nagios and iptables roles are now dependencies of the 'common' role.
This commit is contained in:
parent
1dfc4a8a79
commit
8023613031
|
@ -0,0 +1,894 @@
|
||||||
|
import re
|
||||||
|
import cgitb
|
||||||
|
import warnings
|
||||||
|
import urllib2
|
||||||
|
import sys
|
||||||
|
import logging
|
||||||
|
from string import Template
|
||||||
|
from urlparse import urlparse
|
||||||
|
from datetime import datetime
|
||||||
|
import uuid
|
||||||
|
import hashlib
|
||||||
|
import dateutil
|
||||||
|
import mimetypes
|
||||||
|
|
||||||
|
|
||||||
|
from pylons import config
|
||||||
|
from owslib import wms
|
||||||
|
import requests
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
from ckan import plugins as p
|
||||||
|
from ckan import model
|
||||||
|
from ckan.lib.helpers import json
|
||||||
|
from ckan import logic
|
||||||
|
from ckan.lib.navl.validators import not_empty
|
||||||
|
from ckan.lib.search.index import PackageSearchIndex
|
||||||
|
|
||||||
|
from ckanext.harvest.harvesters.base import HarvesterBase
|
||||||
|
from ckanext.harvest.model import HarvestObject
|
||||||
|
|
||||||
|
from ckanext.spatial.validation import Validators, all_validators
|
||||||
|
from ckanext.spatial.model import ISODocument
|
||||||
|
from ckanext.spatial.interfaces import ISpatialHarvester
|
||||||
|
|
||||||
|
# Module-level logger for this harvester module.
log = logging.getLogger(__name__)

# Validation profiles used when neither the harvest source config nor the
# ini file sets 'validator_profiles' (see SpatialHarvester._get_validator).
DEFAULT_VALIDATOR_PROFILES = ['iso19139']
|
||||||
|
|
||||||
|
|
||||||
|
def text_traceback():
    '''
    Return a plain-text rendering of the exception currently being handled.

    Formats ``sys.exc_info()`` with ``cgitb`` and keeps only the portion
    after the "the original traceback:" marker, silencing any warnings
    raised while formatting.
    '''
    marker = 'the original traceback:'
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        formatted = cgitb.text(sys.exc_info())
        return marker.join(formatted.split(marker)[1:]).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def guess_standard(content):
    '''
    Guess the metadata standard of an XML document from its closing tags.

    Returns 'iso' for ISO 19139 (gmd) or 19115-2 (gmi) documents, 'fgdc'
    for FGDC documents, and 'unknown' otherwise. The match is
    case-insensitive.
    '''
    lowered = content.lower()
    # Closing-tag markers, already lower-cased, mapped to their standard.
    markers = (
        ('</gmd:md_metadata>', 'iso'),
        ('</gmi:mi_metadata>', 'iso'),
        ('</metadata>', 'fgdc'),
    )
    for marker, standard in markers:
        if marker in lowered:
            return standard
    return 'unknown'
|
||||||
|
|
||||||
|
|
||||||
|
def guess_resource_format(url, use_mimetypes=True):
    '''
    Given a URL try to guess the best format to assign to the resource

    The function looks for common patterns in popular geospatial services and
    file extensions, so it may not be 100% accurate. It just looks at the
    provided URL, it does not attempt to perform any remote check.

    if 'use_mimetypes' is True (default value), the mimetypes module will be
    used if no match was found before.

    Returns None if no format could be guessed.
    '''
    url = url.lower().strip()

    # Substring patterns identifying well-known geospatial services.
    resource_types = {
        # OGC
        'wms': ('service=wms', 'geoserver/wms', 'mapserver/wmsserver', 'com.esri.wms.Esrimap', 'service/wms'),
        'wfs': ('service=wfs', 'geoserver/wfs', 'mapserver/wfsserver', 'com.esri.wfs.Esrimap'),
        'wcs': ('service=wcs', 'geoserver/wcs', 'imageserver/wcsserver', 'mapserver/wcsserver'),
        'sos': ('service=sos',),
        'csw': ('service=csw',),
        # ESRI
        'kml': ('mapserver/generatekml',),
        'arcims': ('com.esri.esrimap.esrimap',),
        'arcgis_rest': ('arcgis/rest/services',),
    }

    # .items() instead of the Py2-only .iteritems(): equivalent here and
    # forward-compatible with Python 3.
    for resource_type, parts in resource_types.items():
        if any(part in url for part in parts):
            return resource_type

    # File extensions not reliably covered by the mimetypes module.
    file_types = {
        'kml': ('kml',),
        'kmz': ('kmz',),
        'gml': ('gml',),
    }

    for file_type, extensions in file_types.items():
        if any(url.endswith(extension) for extension in extensions):
            return file_type

    # Fall back to the stdlib guess (returns e.g. 'application/pdf');
    # guess_type returns (None, None) when it cannot tell.
    resource_format = mimetypes.guess_type(url)[0]
    if resource_format:
        return resource_format

    return None
|
||||||
|
|
||||||
|
|
||||||
|
class SpatialHarvester(HarvesterBase):
    # Base class shared by the spatial harvesters. Subclasses provide the
    # gather/fetch stages; the import stage and helpers live here.

    # Cached name of the user performing harvest actions
    # (lazily resolved by _get_user_name)
    _user_name = None

    # Cached site user dict returned by the 'get_site_user' action
    _site_user = None

    # Parsed harvest-source JSON configuration (see _set_source_config)
    source_config = {}

    # When True, import_stage re-imports the object regardless of its
    # recorded status or modification date
    force_import = False

    # GeoJSON polygon template used to build the 'spatial' extra from a
    # bounding box (corners listed counter-clockwise, ring closed)
    extent_template = Template('''
    {"type": "Polygon", "coordinates": [[[$xmin, $ymin], [$xmax, $ymin], [$xmax, $ymax], [$xmin, $ymax], [$xmin, $ymin]]]}
    ''')
|
||||||
|
|
||||||
|
## IHarvester
|
||||||
|
|
||||||
|
def validate_config(self, source_config):
|
||||||
|
if not source_config:
|
||||||
|
return source_config
|
||||||
|
|
||||||
|
try:
|
||||||
|
source_config_obj = json.loads(source_config)
|
||||||
|
|
||||||
|
if 'validator_profiles' in source_config_obj:
|
||||||
|
if not isinstance(source_config_obj['validator_profiles'], list):
|
||||||
|
raise ValueError('validator_profiles must be a list')
|
||||||
|
|
||||||
|
# Check if all profiles exist
|
||||||
|
existing_profiles = [v.name for v in all_validators]
|
||||||
|
unknown_profiles = set(source_config_obj['validator_profiles']) - set(existing_profiles)
|
||||||
|
|
||||||
|
if len(unknown_profiles) > 0:
|
||||||
|
raise ValueError('Unknown validation profile(s): %s' % ','.join(unknown_profiles))
|
||||||
|
|
||||||
|
if 'default_tags' in source_config_obj:
|
||||||
|
if not isinstance(source_config_obj['default_tags'],list):
|
||||||
|
raise ValueError('default_tags must be a list')
|
||||||
|
|
||||||
|
if 'default_extras' in source_config_obj:
|
||||||
|
if not isinstance(source_config_obj['default_extras'],dict):
|
||||||
|
raise ValueError('default_extras must be a dictionary')
|
||||||
|
|
||||||
|
for key in ('override_extras'):
|
||||||
|
if key in source_config_obj:
|
||||||
|
if not isinstance(source_config_obj[key],bool):
|
||||||
|
raise ValueError('%s must be boolean' % key)
|
||||||
|
|
||||||
|
except ValueError, e:
|
||||||
|
raise e
|
||||||
|
|
||||||
|
return source_config
|
||||||
|
|
||||||
|
##
|
||||||
|
|
||||||
|
## SpatialHarvester
|
||||||
|
|
||||||
|
|
||||||
|
def get_package_dict(self, iso_values, harvest_object):
|
||||||
|
'''
|
||||||
|
Constructs a package_dict suitable to be passed to package_create or
|
||||||
|
package_update. See documentation on
|
||||||
|
ckan.logic.action.create.package_create for more details
|
||||||
|
|
||||||
|
Extensions willing to modify the dict should do so implementing the
|
||||||
|
ISpatialHarvester interface
|
||||||
|
|
||||||
|
import ckan.plugins as p
|
||||||
|
from ckanext.spatial.interfaces import ISpatialHarvester
|
||||||
|
|
||||||
|
class MyHarvester(p.SingletonPlugin):
|
||||||
|
|
||||||
|
p.implements(ISpatialHarvester, inherit=True)
|
||||||
|
|
||||||
|
def get_package_dict(self, context, data_dict):
|
||||||
|
|
||||||
|
package_dict = data_dict['package_dict']
|
||||||
|
|
||||||
|
package_dict['extras'].append(
|
||||||
|
{'key': 'my-custom-extra', 'value': 'my-custom-value'}
|
||||||
|
)
|
||||||
|
|
||||||
|
return package_dict
|
||||||
|
|
||||||
|
If a dict is not returned by this function, the import stage will be cancelled.
|
||||||
|
|
||||||
|
:param iso_values: Dictionary with parsed values from the ISO 19139
|
||||||
|
XML document
|
||||||
|
:type iso_values: dict
|
||||||
|
:param harvest_object: HarvestObject domain object (with access to
|
||||||
|
job and source objects)
|
||||||
|
:type harvest_object: HarvestObject
|
||||||
|
|
||||||
|
:returns: A dataset dictionary (package_dict)
|
||||||
|
:rtype: dict
|
||||||
|
'''
|
||||||
|
|
||||||
|
tags = []
|
||||||
|
if 'tags' in iso_values:
|
||||||
|
for tag in iso_values['tags']:
|
||||||
|
tag = tag[:50] if len(tag) > 50 else tag
|
||||||
|
tags.append({'name': tag})
|
||||||
|
|
||||||
|
# Add default_tags from config
|
||||||
|
default_tags = self.source_config.get('default_tags',[])
|
||||||
|
if default_tags:
|
||||||
|
for tag in default_tags:
|
||||||
|
tags.append({'name': tag})
|
||||||
|
|
||||||
|
package_dict = {
|
||||||
|
'title': iso_values['title'],
|
||||||
|
'notes': iso_values['abstract'],
|
||||||
|
'tags': tags,
|
||||||
|
'resources': [],
|
||||||
|
}
|
||||||
|
|
||||||
|
# We need to get the owner organization (if any) from the harvest
|
||||||
|
# source dataset
|
||||||
|
source_dataset = model.Package.get(harvest_object.source.id)
|
||||||
|
if source_dataset.owner_org:
|
||||||
|
package_dict['owner_org'] = source_dataset.owner_org
|
||||||
|
|
||||||
|
# Package name
|
||||||
|
package = harvest_object.package
|
||||||
|
if package is None or package.title != iso_values['title']:
|
||||||
|
name = self._gen_new_name(iso_values['title'])
|
||||||
|
if not name:
|
||||||
|
name = self._gen_new_name(str(iso_values['guid']))
|
||||||
|
if not name:
|
||||||
|
raise Exception('Could not generate a unique name from the title or the GUID. Please choose a more unique title.')
|
||||||
|
package_dict['name'] = name
|
||||||
|
else:
|
||||||
|
package_dict['name'] = package.name
|
||||||
|
|
||||||
|
extras = {
|
||||||
|
'guid': harvest_object.guid,
|
||||||
|
'spatial_harvester': True,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Just add some of the metadata as extras, not the whole lot
|
||||||
|
for name in [
|
||||||
|
# Essentials
|
||||||
|
'spatial-reference-system',
|
||||||
|
'guid',
|
||||||
|
# Usefuls
|
||||||
|
'dataset-reference-date',
|
||||||
|
'metadata-language', # Language
|
||||||
|
'metadata-date', # Released
|
||||||
|
'coupled-resource',
|
||||||
|
'contact-email',
|
||||||
|
'frequency-of-update',
|
||||||
|
'spatial-data-service-type',
|
||||||
|
]:
|
||||||
|
extras[name] = iso_values[name]
|
||||||
|
|
||||||
|
if len(iso_values.get('progress', [])):
|
||||||
|
extras['progress'] = iso_values['progress'][0]
|
||||||
|
else:
|
||||||
|
extras['progress'] = ''
|
||||||
|
|
||||||
|
if len(iso_values.get('resource-type', [])):
|
||||||
|
extras['resource-type'] = iso_values['resource-type'][0]
|
||||||
|
else:
|
||||||
|
extras['resource-type'] = ''
|
||||||
|
|
||||||
|
extras['licence'] = iso_values.get('use-constraints', '')
|
||||||
|
|
||||||
|
def _extract_first_license_url(licences):
|
||||||
|
for licence in licences:
|
||||||
|
o = urlparse(licence)
|
||||||
|
if o.scheme and o.netloc:
|
||||||
|
return licence
|
||||||
|
return None
|
||||||
|
|
||||||
|
if len(extras['licence']):
|
||||||
|
license_url_extracted = _extract_first_license_url(extras['licence'])
|
||||||
|
if license_url_extracted:
|
||||||
|
extras['licence_url'] = license_url_extracted
|
||||||
|
|
||||||
|
|
||||||
|
# Metadata license ID check for package
|
||||||
|
use_constraints = iso_values.get('use-constraints')
|
||||||
|
if use_constraints:
|
||||||
|
|
||||||
|
context = {'model': model, 'session': model.Session, 'user': self._get_user_name()}
|
||||||
|
license_list = p.toolkit.get_action('license_list')(context, {})
|
||||||
|
|
||||||
|
for constraint in use_constraints:
|
||||||
|
package_license = None
|
||||||
|
|
||||||
|
for license in license_list:
|
||||||
|
if constraint.lower() == license.get('id') or constraint == license.get('url'):
|
||||||
|
package_license = license.get('id')
|
||||||
|
break
|
||||||
|
|
||||||
|
if package_license:
|
||||||
|
package_dict['license_id'] = package_license
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
|
extras['access_constraints'] = iso_values.get('limitations-on-public-access', '')
|
||||||
|
|
||||||
|
# Grpahic preview
|
||||||
|
browse_graphic = iso_values.get('browse-graphic')
|
||||||
|
if browse_graphic:
|
||||||
|
browse_graphic = browse_graphic[0]
|
||||||
|
extras['graphic-preview-file'] = browse_graphic.get('file')
|
||||||
|
if browse_graphic.get('description'):
|
||||||
|
extras['graphic-preview-description'] = browse_graphic.get('description')
|
||||||
|
if browse_graphic.get('type'):
|
||||||
|
extras['graphic-preview-type'] = browse_graphic.get('type')
|
||||||
|
|
||||||
|
|
||||||
|
for key in ['temporal-extent-begin', 'temporal-extent-end']:
|
||||||
|
if len(iso_values[key]) > 0:
|
||||||
|
extras[key] = iso_values[key][0]
|
||||||
|
|
||||||
|
# Save responsible organization roles
|
||||||
|
if iso_values['responsible-organisation']:
|
||||||
|
parties = {}
|
||||||
|
for party in iso_values['responsible-organisation']:
|
||||||
|
if party['organisation-name'] in parties:
|
||||||
|
if not party['role'] in parties[party['organisation-name']]:
|
||||||
|
parties[party['organisation-name']].append(party['role'])
|
||||||
|
else:
|
||||||
|
parties[party['organisation-name']] = [party['role']]
|
||||||
|
extras['responsible-party'] = [{'name': k, 'roles': v} for k, v in parties.iteritems()]
|
||||||
|
|
||||||
|
if len(iso_values['bbox']) > 0:
|
||||||
|
bbox = iso_values['bbox'][0]
|
||||||
|
extras['bbox-east-long'] = bbox['east']
|
||||||
|
extras['bbox-north-lat'] = bbox['north']
|
||||||
|
extras['bbox-south-lat'] = bbox['south']
|
||||||
|
extras['bbox-west-long'] = bbox['west']
|
||||||
|
|
||||||
|
try:
|
||||||
|
xmin = float(bbox['west'])
|
||||||
|
xmax = float(bbox['east'])
|
||||||
|
ymin = float(bbox['south'])
|
||||||
|
ymax = float(bbox['north'])
|
||||||
|
except ValueError, e:
|
||||||
|
self._save_object_error('Error parsing bounding box value: {0}'.format(str(e)),
|
||||||
|
harvest_object, 'Import')
|
||||||
|
else:
|
||||||
|
# Construct a GeoJSON extent so ckanext-spatial can register the extent geometry
|
||||||
|
|
||||||
|
# Some publishers define the same two corners for the bbox (ie a point),
|
||||||
|
# that causes problems in the search if stored as polygon
|
||||||
|
if xmin == xmax or ymin == ymax:
|
||||||
|
extent_string = Template('{"type": "Point", "coordinates": [$x, $y]}').substitute(
|
||||||
|
x=xmin, y=ymin
|
||||||
|
)
|
||||||
|
self._save_object_error('Point extent defined instead of polygon',
|
||||||
|
harvest_object, 'Import')
|
||||||
|
else:
|
||||||
|
extent_string = self.extent_template.substitute(
|
||||||
|
xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax
|
||||||
|
)
|
||||||
|
|
||||||
|
extras['spatial'] = extent_string.strip()
|
||||||
|
else:
|
||||||
|
log.debug('No spatial extent defined for this object')
|
||||||
|
|
||||||
|
resource_locators = iso_values.get('resource-locator', []) +\
|
||||||
|
iso_values.get('resource-locator-identification', [])
|
||||||
|
|
||||||
|
if len(resource_locators):
|
||||||
|
for resource_locator in resource_locators:
|
||||||
|
url = resource_locator.get('url', '').strip()
|
||||||
|
if url:
|
||||||
|
resource = {}
|
||||||
|
resource['format'] = guess_resource_format(url)
|
||||||
|
if resource['format'] == 'wms' and config.get('ckanext.spatial.harvest.validate_wms', False):
|
||||||
|
# Check if the service is a view service
|
||||||
|
test_url = url.split('?')[0] if '?' in url else url
|
||||||
|
if self._is_wms(test_url):
|
||||||
|
resource['verified'] = True
|
||||||
|
resource['verified_date'] = datetime.now().isoformat()
|
||||||
|
|
||||||
|
resource.update(
|
||||||
|
{
|
||||||
|
'url': url,
|
||||||
|
'name': resource_locator.get('name') or p.toolkit._('Unnamed resource'),
|
||||||
|
'description': resource_locator.get('description') or '',
|
||||||
|
'resource_locator_protocol': resource_locator.get('protocol') or '',
|
||||||
|
'resource_locator_function': resource_locator.get('function') or '',
|
||||||
|
})
|
||||||
|
package_dict['resources'].append(resource)
|
||||||
|
|
||||||
|
|
||||||
|
# Add default_extras from config
|
||||||
|
default_extras = self.source_config.get('default_extras',{})
|
||||||
|
if default_extras:
|
||||||
|
override_extras = self.source_config.get('override_extras',False)
|
||||||
|
for key,value in default_extras.iteritems():
|
||||||
|
log.debug('Processing extra %s', key)
|
||||||
|
if not key in extras or override_extras:
|
||||||
|
# Look for replacement strings
|
||||||
|
if isinstance(value,basestring):
|
||||||
|
value = value.format(harvest_source_id=harvest_object.job.source.id,
|
||||||
|
harvest_source_url=harvest_object.job.source.url.strip('/'),
|
||||||
|
harvest_source_title=harvest_object.job.source.title,
|
||||||
|
harvest_job_id=harvest_object.job.id,
|
||||||
|
harvest_object_id=harvest_object.id)
|
||||||
|
extras[key] = value
|
||||||
|
|
||||||
|
extras_as_dict = []
|
||||||
|
for key, value in extras.iteritems():
|
||||||
|
if isinstance(value, (list, dict)):
|
||||||
|
extras_as_dict.append({'key': key, 'value': json.dumps(value)})
|
||||||
|
else:
|
||||||
|
extras_as_dict.append({'key': key, 'value': value})
|
||||||
|
|
||||||
|
package_dict['extras'] = extras_as_dict
|
||||||
|
|
||||||
|
return package_dict
|
||||||
|
|
||||||
|
    def transform_to_iso(self, original_document, original_format, harvest_object):
        '''
        DEPRECATED: Use the transform_to_iso method of the ISpatialHarvester
        interface
        '''
        # Flag inspected by import_stage: if it is still False after the
        # call, a subclass overrode this method (old-style API) and a
        # deprecation warning is logged. The base implementation transforms
        # nothing and returns None.
        self.__base_transform_to_iso_called = True
        return None
|
||||||
|
|
||||||
|
def import_stage(self, harvest_object):
|
||||||
|
context = {
|
||||||
|
'model': model,
|
||||||
|
'session': model.Session,
|
||||||
|
'user': self._get_user_name(),
|
||||||
|
}
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__ + '.import')
|
||||||
|
log.debug('Import stage for harvest object: %s', harvest_object.id)
|
||||||
|
|
||||||
|
if not harvest_object:
|
||||||
|
log.error('No harvest object received')
|
||||||
|
return False
|
||||||
|
|
||||||
|
self._set_source_config(harvest_object.source.config)
|
||||||
|
|
||||||
|
if self.force_import:
|
||||||
|
status = 'change'
|
||||||
|
else:
|
||||||
|
status = self._get_object_extra(harvest_object, 'status')
|
||||||
|
|
||||||
|
# Get the last harvested object (if any)
|
||||||
|
previous_object = model.Session.query(HarvestObject) \
|
||||||
|
.filter(HarvestObject.guid==harvest_object.guid) \
|
||||||
|
.filter(HarvestObject.current==True) \
|
||||||
|
.first()
|
||||||
|
|
||||||
|
if status == 'delete':
|
||||||
|
# Delete package
|
||||||
|
context.update({
|
||||||
|
'ignore_auth': True,
|
||||||
|
})
|
||||||
|
p.toolkit.get_action('package_delete')(context, {'id': harvest_object.package_id})
|
||||||
|
log.info('Deleted package {0} with guid {1}'.format(harvest_object.package_id, harvest_object.guid))
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check if it is a non ISO document
|
||||||
|
original_document = self._get_object_extra(harvest_object, 'original_document')
|
||||||
|
original_format = self._get_object_extra(harvest_object, 'original_format')
|
||||||
|
if original_document and original_format:
|
||||||
|
#DEPRECATED use the ISpatialHarvester interface method
|
||||||
|
self.__base_transform_to_iso_called = False
|
||||||
|
content = self.transform_to_iso(original_document, original_format, harvest_object)
|
||||||
|
if not self.__base_transform_to_iso_called:
|
||||||
|
log.warn('Deprecation warning: calling transform_to_iso directly is deprecated. ' +
|
||||||
|
'Please use the ISpatialHarvester interface method instead.')
|
||||||
|
|
||||||
|
for harvester in p.PluginImplementations(ISpatialHarvester):
|
||||||
|
content = harvester.transform_to_iso(original_document, original_format, harvest_object)
|
||||||
|
|
||||||
|
if content:
|
||||||
|
harvest_object.content = content
|
||||||
|
else:
|
||||||
|
self._save_object_error('Transformation to ISO failed', harvest_object, 'Import')
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
if harvest_object.content is None:
|
||||||
|
self._save_object_error('Empty content for object {0}'.format(harvest_object.id), harvest_object, 'Import')
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Validate ISO document
|
||||||
|
is_valid, profile, errors = self._validate_document(harvest_object.content, harvest_object)
|
||||||
|
if not is_valid:
|
||||||
|
# If validation errors were found, import will stop unless
|
||||||
|
# configuration per source or per instance says otherwise
|
||||||
|
continue_import = p.toolkit.asbool(config.get('ckanext.spatial.harvest.continue_on_validation_errors', False)) or \
|
||||||
|
self.source_config.get('continue_on_validation_errors')
|
||||||
|
if not continue_import:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Parse ISO document
|
||||||
|
try:
|
||||||
|
|
||||||
|
iso_parser = ISODocument(harvest_object.content)
|
||||||
|
iso_values = iso_parser.read_values()
|
||||||
|
except Exception, e:
|
||||||
|
self._save_object_error('Error parsing ISO document for object {0}: {1}'.format(harvest_object.id, str(e)),
|
||||||
|
harvest_object, 'Import')
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Flag previous object as not current anymore
|
||||||
|
if previous_object and not self.force_import:
|
||||||
|
previous_object.current = False
|
||||||
|
previous_object.add()
|
||||||
|
|
||||||
|
# Update GUID with the one on the document
|
||||||
|
iso_guid = iso_values['guid']
|
||||||
|
if iso_guid and harvest_object.guid != iso_guid:
|
||||||
|
# First make sure there already aren't current objects
|
||||||
|
# with the same guid
|
||||||
|
existing_object = model.Session.query(HarvestObject.id) \
|
||||||
|
.filter(HarvestObject.guid==iso_guid) \
|
||||||
|
.filter(HarvestObject.current==True) \
|
||||||
|
.first()
|
||||||
|
if existing_object:
|
||||||
|
self._save_object_error('Object {0} already has this guid {1}'.format(existing_object.id, iso_guid),
|
||||||
|
harvest_object, 'Import')
|
||||||
|
return False
|
||||||
|
|
||||||
|
harvest_object.guid = iso_guid
|
||||||
|
harvest_object.add()
|
||||||
|
|
||||||
|
# Generate GUID if not present (i.e. it's a manual import)
|
||||||
|
if not harvest_object.guid:
|
||||||
|
m = hashlib.md5()
|
||||||
|
m.update(harvest_object.content.encode('utf8', 'ignore'))
|
||||||
|
harvest_object.guid = m.hexdigest()
|
||||||
|
harvest_object.add()
|
||||||
|
|
||||||
|
# Get document modified date
|
||||||
|
try:
|
||||||
|
metadata_modified_date = dateutil.parser.parse(iso_values['metadata-date'], ignoretz=True)
|
||||||
|
except ValueError:
|
||||||
|
self._save_object_error('Could not extract reference date for object {0} ({1})'
|
||||||
|
.format(harvest_object.id, iso_values['metadata-date']), harvest_object, 'Import')
|
||||||
|
return False
|
||||||
|
|
||||||
|
harvest_object.metadata_modified_date = metadata_modified_date
|
||||||
|
harvest_object.add()
|
||||||
|
|
||||||
|
|
||||||
|
# Build the package dict
|
||||||
|
package_dict = self.get_package_dict(iso_values, harvest_object)
|
||||||
|
for harvester in p.PluginImplementations(ISpatialHarvester):
|
||||||
|
package_dict = harvester.get_package_dict(context, {
|
||||||
|
'package_dict': package_dict,
|
||||||
|
'iso_values': iso_values,
|
||||||
|
'xml_tree': iso_parser.xml_tree,
|
||||||
|
'harvest_object': harvest_object,
|
||||||
|
})
|
||||||
|
if not package_dict:
|
||||||
|
log.error('No package dict returned, aborting import for object {0}'.format(harvest_object.id))
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Create / update the package
|
||||||
|
context.update({
|
||||||
|
'extras_as_string': True,
|
||||||
|
'api_version': '2',
|
||||||
|
'return_id_only': True})
|
||||||
|
|
||||||
|
if self._site_user and context['user'] == self._site_user['name']:
|
||||||
|
context['ignore_auth'] = True
|
||||||
|
|
||||||
|
|
||||||
|
# The default package schema does not like Upper case tags
|
||||||
|
tag_schema = logic.schema.default_tags_schema()
|
||||||
|
tag_schema['name'] = [not_empty, unicode]
|
||||||
|
|
||||||
|
# Flag this object as the current one
|
||||||
|
harvest_object.current = True
|
||||||
|
harvest_object.add()
|
||||||
|
|
||||||
|
if status == 'new':
|
||||||
|
package_schema = logic.schema.default_create_package_schema()
|
||||||
|
package_schema['tags'] = tag_schema
|
||||||
|
context['schema'] = package_schema
|
||||||
|
|
||||||
|
# We need to explicitly provide a package ID, otherwise ckanext-spatial
|
||||||
|
# won't be be able to link the extent to the package.
|
||||||
|
package_dict['id'] = unicode(uuid.uuid4())
|
||||||
|
package_schema['id'] = [unicode]
|
||||||
|
|
||||||
|
# Save reference to the package on the object
|
||||||
|
harvest_object.package_id = package_dict['id']
|
||||||
|
harvest_object.add()
|
||||||
|
# Defer constraints and flush so the dataset can be indexed with
|
||||||
|
# the harvest object id (on the after_show hook from the harvester
|
||||||
|
# plugin)
|
||||||
|
model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
|
||||||
|
model.Session.flush()
|
||||||
|
|
||||||
|
try:
|
||||||
|
package_id = p.toolkit.get_action('package_create')(context, package_dict)
|
||||||
|
log.info('Created new package %s with guid %s', package_id, harvest_object.guid)
|
||||||
|
except p.toolkit.ValidationError, e:
|
||||||
|
self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import')
|
||||||
|
return False
|
||||||
|
|
||||||
|
elif status == 'change':
|
||||||
|
|
||||||
|
# Check if the modified date is more recent
|
||||||
|
if not self.force_import and previous_object and harvest_object.metadata_modified_date <= previous_object.metadata_modified_date:
|
||||||
|
|
||||||
|
# Assign the previous job id to the new object to
|
||||||
|
# avoid losing history
|
||||||
|
harvest_object.harvest_job_id = previous_object.job.id
|
||||||
|
harvest_object.add()
|
||||||
|
|
||||||
|
# Delete the previous object to avoid cluttering the object table
|
||||||
|
previous_object.delete()
|
||||||
|
|
||||||
|
# Reindex the corresponding package to update the reference to the
|
||||||
|
# harvest object
|
||||||
|
if ((config.get('ckanext.spatial.harvest.reindex_unchanged', True) != 'False'
|
||||||
|
or self.source_config.get('reindex_unchanged') != 'False')
|
||||||
|
and harvest_object.package_id):
|
||||||
|
context.update({'validate': False, 'ignore_auth': True})
|
||||||
|
try:
|
||||||
|
package_dict = logic.get_action('package_show')(context,
|
||||||
|
{'id': harvest_object.package_id})
|
||||||
|
except p.toolkit.ObjectNotFound:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
for extra in package_dict.get('extras', []):
|
||||||
|
if extra['key'] == 'harvest_object_id':
|
||||||
|
extra['value'] = harvest_object.id
|
||||||
|
if package_dict:
|
||||||
|
package_index = PackageSearchIndex()
|
||||||
|
package_index.index_package(package_dict)
|
||||||
|
|
||||||
|
log.info('Document with GUID %s unchanged, skipping...' % (harvest_object.guid))
|
||||||
|
else:
|
||||||
|
package_schema = logic.schema.default_update_package_schema()
|
||||||
|
package_schema['tags'] = tag_schema
|
||||||
|
context['schema'] = package_schema
|
||||||
|
|
||||||
|
package_dict['id'] = harvest_object.package_id
|
||||||
|
try:
|
||||||
|
package_id = p.toolkit.get_action('package_update')(context, package_dict)
|
||||||
|
log.info('Updated package %s with guid %s', package_id, harvest_object.guid)
|
||||||
|
except p.toolkit.ValidationError, e:
|
||||||
|
self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import')
|
||||||
|
return False
|
||||||
|
|
||||||
|
model.Session.commit()
|
||||||
|
|
||||||
|
return True
|
||||||
|
##
|
||||||
|
|
||||||
|
def _is_wms(self, url):
|
||||||
|
'''
|
||||||
|
Checks if the provided URL actually points to a Web Map Service.
|
||||||
|
Uses owslib WMS reader to parse the response.
|
||||||
|
'''
|
||||||
|
try:
|
||||||
|
capabilities_url = wms.WMSCapabilitiesReader().capabilities_url(url)
|
||||||
|
res = urllib2.urlopen(capabilities_url, None, 10)
|
||||||
|
xml = res.read()
|
||||||
|
|
||||||
|
s = wms.WebMapService(url, xml=xml)
|
||||||
|
return isinstance(s.contents, dict) and s.contents != {}
|
||||||
|
except Exception, e:
|
||||||
|
log.error('WMS check for %s failed with exception: %s' % (url, str(e)))
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _get_object_extra(self, harvest_object, key):
|
||||||
|
'''
|
||||||
|
Helper function for retrieving the value from a harvest object extra,
|
||||||
|
given the key
|
||||||
|
'''
|
||||||
|
for extra in harvest_object.extras:
|
||||||
|
if extra.key == key:
|
||||||
|
return extra.value
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _set_source_config(self, config_str):
|
||||||
|
'''
|
||||||
|
Loads the source configuration JSON object into a dict for
|
||||||
|
convenient access
|
||||||
|
'''
|
||||||
|
if config_str:
|
||||||
|
self.source_config = json.loads(config_str)
|
||||||
|
log.debug('Using config: %r', self.source_config)
|
||||||
|
else:
|
||||||
|
self.source_config = {}
|
||||||
|
|
||||||
|
def _get_validator(self):
|
||||||
|
'''
|
||||||
|
Returns the validator object using the relevant profiles
|
||||||
|
|
||||||
|
The profiles to be used are assigned in the following order:
|
||||||
|
|
||||||
|
1. 'validator_profiles' property of the harvest source config object
|
||||||
|
2. 'ckan.spatial.validator.profiles' configuration option in the ini file
|
||||||
|
3. Default value as defined in DEFAULT_VALIDATOR_PROFILES
|
||||||
|
'''
|
||||||
|
if not hasattr(self, '_validator'):
|
||||||
|
if hasattr(self, 'source_config') and self.source_config.get('validator_profiles', None):
|
||||||
|
profiles = self.source_config.get('validator_profiles')
|
||||||
|
elif config.get('ckan.spatial.validator.profiles', None):
|
||||||
|
profiles = [
|
||||||
|
x.strip() for x in
|
||||||
|
config.get('ckan.spatial.validator.profiles').split(',')
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
profiles = DEFAULT_VALIDATOR_PROFILES
|
||||||
|
self._validator = Validators(profiles=profiles)
|
||||||
|
|
||||||
|
# Add any custom validators from extensions
|
||||||
|
for plugin_with_validators in p.PluginImplementations(ISpatialHarvester):
|
||||||
|
custom_validators = plugin_with_validators.get_validators()
|
||||||
|
for custom_validator in custom_validators:
|
||||||
|
if custom_validator not in all_validators:
|
||||||
|
self._validator.add_validator(custom_validator)
|
||||||
|
|
||||||
|
|
||||||
|
return self._validator
|
||||||
|
|
||||||
|
def _get_user_name(self):
|
||||||
|
'''
|
||||||
|
Returns the name of the user that will perform the harvesting actions
|
||||||
|
(deleting, updating and creating datasets)
|
||||||
|
|
||||||
|
By default this will be the internal site admin user. This is the
|
||||||
|
recommended setting, but if necessary it can be overridden with the
|
||||||
|
`ckanext.spatial.harvest.user_name` config option, eg to support the
|
||||||
|
old hardcoded 'harvest' user:
|
||||||
|
|
||||||
|
ckanext.spatial.harvest.user_name = harvest
|
||||||
|
|
||||||
|
'''
|
||||||
|
if self._user_name:
|
||||||
|
return self._user_name
|
||||||
|
|
||||||
|
context = {'model': model,
|
||||||
|
'ignore_auth': True,
|
||||||
|
'defer_commit': True, # See ckan/ckan#1714
|
||||||
|
}
|
||||||
|
self._site_user = p.toolkit.get_action('get_site_user')(context, {})
|
||||||
|
|
||||||
|
config_user_name = config.get('ckanext.spatial.harvest.user_name')
|
||||||
|
if config_user_name:
|
||||||
|
self._user_name = config_user_name
|
||||||
|
else:
|
||||||
|
self._user_name = self._site_user['name']
|
||||||
|
|
||||||
|
return self._user_name
|
||||||
|
|
||||||
|
def _get_content(self, url):
|
||||||
|
'''
|
||||||
|
DEPRECATED: Use _get_content_as_unicode instead
|
||||||
|
'''
|
||||||
|
|
||||||
|
parts = urlparse.urlparse(url)
|
||||||
|
if parts.username and parts.password:
|
||||||
|
auth_url = url.rsplit('/', 1)[0]
|
||||||
|
auth_url = auth_url + '/xml.user.login'
|
||||||
|
auth_url = urlparse.urlunparse((
|
||||||
|
parts.scheme,
|
||||||
|
parts.netloc,
|
||||||
|
parts.path
|
||||||
|
))
|
||||||
|
log.error('Authenticate agains Geonetwork. User is %s and password is %s', parts.username, parts.password)
|
||||||
|
auth_data = minidom.Document()
|
||||||
|
root = auth_data.createElement('request')
|
||||||
|
auth_data.appendChild(root)
|
||||||
|
username_tag = auth_data.createElement('username')
|
||||||
|
user_data = auth_data.createTextNode(parts.username)
|
||||||
|
username_tag.appendChild(user_data)
|
||||||
|
root.appendChild(username_tag)
|
||||||
|
password_tag = auth_data.createElement('password')
|
||||||
|
password_data = auth_data.createTextNode(parts.password)
|
||||||
|
password_tag.appendChild(password_data)
|
||||||
|
root.appendChild(password_tag)
|
||||||
|
xml_auth_data = auth_data.toprettyxml(indent=" ")
|
||||||
|
|
||||||
|
req_headers = {'Content-Type': 'application/xml'}
|
||||||
|
|
||||||
|
sess = requests.Session()
|
||||||
|
req = sess.post(url=auth_url, data=xml_auth_data, headers=req_headers)
|
||||||
|
opener = urllib2.build_opener()
|
||||||
|
opener.addheaders.append(('Set-Cookie', req.cookie))
|
||||||
|
|
||||||
|
url = url.replace(' ', '%20')
|
||||||
|
if opener:
|
||||||
|
http_response = opener.open(url)
|
||||||
|
else:
|
||||||
|
http_response = urllib2.urlopen(url)
|
||||||
|
return http_response.read()
|
||||||
|
|
||||||
|
def _get_content_as_unicode(self, url):
|
||||||
|
'''
|
||||||
|
Get remote content as unicode.
|
||||||
|
|
||||||
|
We let requests handle the conversion [1] , which will use the
|
||||||
|
content-type header first or chardet if the header is missing
|
||||||
|
(requests uses its own embedded chardet version).
|
||||||
|
|
||||||
|
As we will be storing and serving the contents as unicode, we actually
|
||||||
|
replace the original XML encoding declaration with an UTF-8 one.
|
||||||
|
|
||||||
|
|
||||||
|
[1] http://github.com/kennethreitz/requests/blob/63243b1e3b435c7736acf1e51c0f6fa6666d861d/requests/models.py#L811
|
||||||
|
|
||||||
|
'''
|
||||||
|
parts = urlparse.urlparse(url)
|
||||||
|
if parts.username and parts.password:
|
||||||
|
auth_url = url.rsplit('/', 1)[0]
|
||||||
|
auth_url = auth_url + '/xml.user.login'
|
||||||
|
auth_url = urlparse.urlunparse((
|
||||||
|
parts.scheme,
|
||||||
|
parts.netloc,
|
||||||
|
parts.path
|
||||||
|
))
|
||||||
|
log.error('Authenticate against Geonetwork. User is %s and password is %s', parts.username, parts.password)
|
||||||
|
auth_data = minidom.Document()
|
||||||
|
root = auth_data.createElement('request')
|
||||||
|
auth_data.appendChild(root)
|
||||||
|
username_tag = auth_data.createElement('username')
|
||||||
|
user_data = auth_data.createTextNode(parts.username)
|
||||||
|
username_tag.appendChild(user_data)
|
||||||
|
root.appendChild(username_tag)
|
||||||
|
password_tag = auth_data.createElement('password')
|
||||||
|
password_data = auth_data.createTextNode(parts.password)
|
||||||
|
password_tag.appendChild(password_data)
|
||||||
|
root.appendChild(password_tag)
|
||||||
|
xml_auth_data = auth_data.toprettyxml(indent=" ")
|
||||||
|
|
||||||
|
req_headers = {'Content-Type': 'application/xml'}
|
||||||
|
|
||||||
|
geo_session = requests.Session()
|
||||||
|
geo_session.post(url=auth_url, data=xml_auth_data, headers=req_headers)
|
||||||
|
|
||||||
|
url = url.replace(' ', '%20')
|
||||||
|
if geo_session:
|
||||||
|
response = geo_session.get(url, timeout=10)
|
||||||
|
else:
|
||||||
|
response = requests.get(url, timeout=10)
|
||||||
|
|
||||||
|
content = response.text
|
||||||
|
|
||||||
|
# Remove original XML declaration
|
||||||
|
content = re.sub('<\?xml(.*)\?>', '', content)
|
||||||
|
|
||||||
|
# Get rid of the BOM and other rubbish at the beginning of the file
|
||||||
|
content = re.sub('.*?<', '<', content, 1)
|
||||||
|
content = content[content.index('<'):]
|
||||||
|
|
||||||
|
return content
|
||||||
|
|
||||||
|
    def _validate_document(self, document_string, harvest_object, validator=None):
        '''
        Validates an XML document with the default, or if present, the
        provided validators.

        It will create a HarvestObjectError for each validation error found,
        so they can be shown properly on the frontend.

        Returns a tuple, with a boolean showing whether the validation passed
        or not, the profile used and a list of errors (tuples with error
        message and error lines if present).
        '''
        # Fall back to the validators built from the source / site config
        if not validator:
            validator = self._get_validator()

        # Strip the XML declaration before parsing (lxml refuses unicode
        # strings that still carry an explicit encoding declaration)
        document_string = re.sub('<\?xml(.*)\?>', '', document_string)

        try:
            xml = etree.fromstring(document_string)
        except etree.XMLSyntaxError, e:
            # Unparseable document: record the error against the harvest
            # object and report the validation as failed with no profile
            self._save_object_error('Could not parse XML file: {0}'.format(str(e)), harvest_object, 'Import')
            return False, None, []

        valid, profile, errors = validator.is_valid(xml)
        if not valid:
            log.error('Validation errors found using profile {0} for object with GUID {1}'.format(profile, harvest_object.guid))
            # One HarvestObjectError per validation error so they can be
            # displayed individually on the frontend
            for error in errors:
                self._save_object_error(error[0], harvest_object, 'Validation', line=error[1])

        return valid, profile, errors
|
|
@ -0,0 +1,187 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8" ?>
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<!--
|
||||||
|
NB Please copy changes to this file into the multilingual schema:
|
||||||
|
ckanext/multilingual/solr/schema.xml
|
||||||
|
-->
|
||||||
|
|
||||||
|
<!-- We update the version when there is a backward-incompatible change to this
|
||||||
|
schema. In this case the version should be set to the next CKAN version number.
|
||||||
|
(x.y but not x.y.z since it needs to be a float) -->
|
||||||
|
<schema name="ckan" version="2.3">
|
||||||
|
|
||||||
|
<types>
|
||||||
|
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
|
||||||
|
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
|
||||||
|
<fieldtype name="binary" class="solr.BinaryField"/>
|
||||||
|
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
|
||||||
|
|
||||||
|
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
<analyzer type="index">
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
|
||||||
|
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
<analyzer type="query">
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||||
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
|
||||||
|
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
|
||||||
|
<!-- A general unstemmed text field - good if one does not know the language of the field -->
|
||||||
|
<fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
<analyzer type="index">
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
<analyzer type="query">
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||||
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
</types>
|
||||||
|
|
||||||
|
|
||||||
|
<fields>
|
||||||
|
<field name="index_id" type="string" indexed="true" stored="true" required="true" />
|
||||||
|
<field name="id" type="string" indexed="true" stored="true" required="true" />
|
||||||
|
<field name="site_id" type="string" indexed="true" stored="true" required="true" />
|
||||||
|
<field name="title" type="text" indexed="true" stored="true" />
|
||||||
|
<field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true" />
|
||||||
|
<field name="dataset_type" type="string" indexed="true" stored="true" />
|
||||||
|
<field name="state" type="string" indexed="true" stored="true" omitNorms="true" />
|
||||||
|
<field name="name" type="string" indexed="true" stored="true" omitNorms="true" />
|
||||||
|
<field name="revision_id" type="string" indexed="true" stored="true" omitNorms="true" />
|
||||||
|
<field name="version" type="string" indexed="true" stored="true" />
|
||||||
|
<field name="url" type="string" indexed="true" stored="true" omitNorms="true" />
|
||||||
|
<field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true" />
|
||||||
|
<field name="download_url" type="string" indexed="true" stored="true" omitNorms="true" />
|
||||||
|
<field name="notes" type="text" indexed="true" stored="true"/>
|
||||||
|
<field name="author" type="textgen" indexed="true" stored="true" />
|
||||||
|
<field name="author_email" type="textgen" indexed="true" stored="true" />
|
||||||
|
<field name="maintainer" type="textgen" indexed="true" stored="true" />
|
||||||
|
<field name="maintainer_email" type="textgen" indexed="true" stored="true" />
|
||||||
|
<field name="license" type="string" indexed="true" stored="true" />
|
||||||
|
<field name="license_id" type="string" indexed="true" stored="true" />
|
||||||
|
<field name="ratings_count" type="int" indexed="true" stored="false" />
|
||||||
|
<field name="ratings_average" type="float" indexed="true" stored="false" />
|
||||||
|
<field name="tags" type="string" indexed="true" stored="true" multiValued="true"/>
|
||||||
|
<field name="groups" type="string" indexed="true" stored="true" multiValued="true"/>
|
||||||
|
<field name="organization" type="string" indexed="true" stored="true" multiValued="false"/>
|
||||||
|
|
||||||
|
<field name="capacity" type="string" indexed="true" stored="true" multiValued="false"/>
|
||||||
|
|
||||||
|
<field name="res_name" type="textgen" indexed="true" stored="true" multiValued="true" />
|
||||||
|
<field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/>
|
||||||
|
<field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/>
|
||||||
|
<field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/>
|
||||||
|
<field name="res_type" type="string" indexed="true" stored="true" multiValued="true"/>
|
||||||
|
|
||||||
|
<!-- Fields needed by the spatial extension-->
|
||||||
|
<field name="bbox_area" type="float" indexed="true" stored="true" />
|
||||||
|
<field name="maxx" type="float" indexed="true" stored="true" />
|
||||||
|
<field name="maxy" type="float" indexed="true" stored="true" />
|
||||||
|
<field name="minx" type="float" indexed="true" stored="true" />
|
||||||
|
<field name="miny" type="float" indexed="true" stored="true" />
|
||||||
|
|
||||||
|
   <!-- catchall field, containing all other searchable text fields (implemented
        via copyField further on in this schema) -->
|
||||||
|
<field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
|
||||||
|
<field name="urls" type="text" indexed="true" stored="false" multiValued="true"/>
|
||||||
|
|
||||||
|
<field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/>
|
||||||
|
<field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/>
|
||||||
|
<field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/>
|
||||||
|
<field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/>
|
||||||
|
<field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/>
|
||||||
|
<field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/>
|
||||||
|
<field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/>
|
||||||
|
<field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/>
|
||||||
|
<field name="views_total" type="int" indexed="true" stored="false"/>
|
||||||
|
<field name="views_recent" type="int" indexed="true" stored="false"/>
|
||||||
|
<field name="resources_accessed_total" type="int" indexed="true" stored="false"/>
|
||||||
|
<field name="resources_accessed_recent" type="int" indexed="true" stored="false"/>
|
||||||
|
|
||||||
|
<field name="metadata_created" type="date" indexed="true" stored="true" multiValued="false"/>
|
||||||
|
<field name="metadata_modified" type="date" indexed="true" stored="true" multiValued="false"/>
|
||||||
|
|
||||||
|
<field name="indexed_ts" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
|
||||||
|
|
||||||
|
<!-- Copy the title field into titleString, and treat as a string
|
||||||
|
(rather than text type). This allows us to sort on the titleString -->
|
||||||
|
<field name="title_string" type="string" indexed="true" stored="false" />
|
||||||
|
|
||||||
|
<field name="data_dict" type="string" indexed="false" stored="true" />
|
||||||
|
<field name="validated_data_dict" type="string" indexed="false" stored="true" />
|
||||||
|
|
||||||
|
<field name="_version_" type="string" indexed="true" stored="true"/>
|
||||||
|
|
||||||
|
<dynamicField name="*_date" type="date" indexed="true" stored="true" multiValued="false"/>
|
||||||
|
|
||||||
|
<dynamicField name="extras_*" type="text" indexed="true" stored="true" multiValued="false"/>
|
||||||
|
<dynamicField name="res_extras_*" type="text" indexed="true" stored="true" multiValued="true"/>
|
||||||
|
<dynamicField name="vocab_*" type="string" indexed="true" stored="true" multiValued="true"/>
|
||||||
|
<dynamicField name="*" type="string" indexed="true" stored="false"/>
|
||||||
|
</fields>
|
||||||
|
|
||||||
|
<uniqueKey>index_id</uniqueKey>
|
||||||
|
<defaultSearchField>text</defaultSearchField>
|
||||||
|
<solrQueryParser defaultOperator="AND"/>
|
||||||
|
|
||||||
|
<copyField source="url" dest="urls"/>
|
||||||
|
<copyField source="ckan_url" dest="urls"/>
|
||||||
|
<copyField source="download_url" dest="urls"/>
|
||||||
|
<copyField source="res_url" dest="urls"/>
|
||||||
|
<copyField source="extras_*" dest="text"/>
|
||||||
|
<copyField source="res_extras_*" dest="text"/>
|
||||||
|
<copyField source="vocab_*" dest="text"/>
|
||||||
|
<copyField source="urls" dest="text"/>
|
||||||
|
<copyField source="name" dest="text"/>
|
||||||
|
<copyField source="title" dest="text"/>
|
||||||
|
<copyField source="text" dest="text"/>
|
||||||
|
<copyField source="license" dest="text"/>
|
||||||
|
<copyField source="notes" dest="text"/>
|
||||||
|
<copyField source="tags" dest="text"/>
|
||||||
|
<copyField source="groups" dest="text"/>
|
||||||
|
<copyField source="organization" dest="text"/>
|
||||||
|
<copyField source="res_name" dest="text"/>
|
||||||
|
<copyField source="res_description" dest="text"/>
|
||||||
|
<copyField source="maintainer" dest="text"/>
|
||||||
|
<copyField source="author" dest="text"/>
|
||||||
|
|
||||||
|
</schema>
|
|
@ -0,0 +1,6 @@
|
||||||
|
---
|
||||||
|
- name: Solr Restart
|
||||||
|
service: name=tomcat-instance-{{ ckan_solr_port }} state=restarted
|
||||||
|
|
||||||
|
- name: Restart CKAN
|
||||||
|
service: name=apache2 state=restarted sleep=10
|
|
@ -0,0 +1,181 @@
|
||||||
|
---
|
||||||
|
- name: Download the CKAN distribution
|
||||||
|
get_url: url='{{ ckan_package_url }}' dest=/srv/{{ ckan_deb_file }}
|
||||||
|
tags: ckan
|
||||||
|
|
||||||
|
- name: Install the CKAN deb package
|
||||||
|
apt: deb=/srv/{{ ckan_deb_file }}
|
||||||
|
register: ckan_install
|
||||||
|
tags: ckan
|
||||||
|
|
||||||
|
- name: Configure the CKAN production configuration file
|
||||||
|
ini_file: dest={{ ckan_config_file }} section={{ item.section }} option={{ item.option }} value={{ item.value }} state={{ item.state }} backup=yes
|
||||||
|
with_items: '{{ ckan_production_ini_opts }}'
|
||||||
|
notify: Restart CKAN
|
||||||
|
tags: [ 'ckan', 'ckan_ini' ]
|
||||||
|
|
||||||
|
- name: Install the solr schema used by CKAN
|
||||||
|
file: src=/usr/lib/ckan/default/src/ckan/ckan/config/solr/schema.xml dest={{ tomcat_m_instances_base_path }}/{{ ckan_solr_port }}/solr/data/solr/collection1/conf/schema.xml state=link force=yes
|
||||||
|
when: not ckan_geonetwork_harvester
|
||||||
|
notify: Solr Restart
|
||||||
|
tags: [ 'ckan', 'solr', 'solr_schema' ]
|
||||||
|
|
||||||
|
- name: Install the solr schema used by CKAN, modified with the spatial fields
|
||||||
|
copy: src=schema.xml dest={{ tomcat_m_instances_base_path }}/{{ ckan_solr_port }}/solr/data/solr/collection1/conf/schema.xml force=yes
|
||||||
|
when: ckan_geonetwork_harvester
|
||||||
|
notify: Solr Restart
|
||||||
|
tags: [ 'ckan', 'solr', 'solr_schema' ]
|
||||||
|
|
||||||
|
- name: Create the base directory for the CKAN file storage
|
||||||
|
file: dest={{ ckan_file_storage_dir }} state=directory owner={{ apache_user }} mode=0700
|
||||||
|
tags: ckan
|
||||||
|
|
||||||
|
- name: Initialize the CKAN databases
|
||||||
|
shell: ckan db init ; ckan datastore set-permissions | su - postgres -c 'psql --set ON_ERROR_STOP=1'
|
||||||
|
when: ( ckan_install | changed )
|
||||||
|
tags: ckan
|
||||||
|
|
||||||
|
- name: Assign the CKAN virtenv dir to the ckan user
|
||||||
|
file: dest={{ ckan_virtenv }} recurse=yes owner={{ ckan_shell_user }} group={{ ckan_shell_user }}
|
||||||
|
tags: [ 'ckan', 'ckan_user' ]
|
||||||
|
|
||||||
|
- name: Create a log directory for the jobs run by the ckan user
|
||||||
|
file: dest=/var/log/ckan state=directory owner={{ ckan_shell_user }} group={{ ckan_shell_user }}
|
||||||
|
tags: [ 'ckan', 'ckan_user' ]
|
||||||
|
|
||||||
|
- name: Install some plugins dependencies inside the CKAN virtualenv
|
||||||
|
become: True
|
||||||
|
become_user: '{{ ckan_shell_user }}'
|
||||||
|
pip: name={{ item }} virtualenv={{ ckan_virtenv }}
|
||||||
|
with_items: '{{ ckan_pip_dependencies }}'
|
||||||
|
when: ckan_geonetwork_harvester
|
||||||
|
tags: [ 'ckan', 'geonetwork', 'ckan_plugins', 'ckan_pip_deps' ]
|
||||||
|
|
||||||
|
- name: Download the CKAN ckanext-harvest plugin
|
||||||
|
become: True
|
||||||
|
become_user: '{{ ckan_shell_user }}'
|
||||||
|
pip: name='{{ ckan_ckanext_harvester_url }}' virtualenv={{ ckan_virtenv }}
|
||||||
|
notify: Restart CKAN
|
||||||
|
when: ckan_geonetwork_harvester
|
||||||
|
register: ckanext_harvest_install
|
||||||
|
tags: [ 'ckan', 'geonetwork', 'ckan_plugins' ]
|
||||||
|
|
||||||
|
- name: Download the CKAN ckanext-harvest requirements
|
||||||
|
become: True
|
||||||
|
become_user: '{{ ckan_shell_user }}'
|
||||||
|
pip: requirements={{ ckan_virtenv }}/src/ckanext-harvest/pip-requirements.txt virtualenv={{ ckan_virtenv }}
|
||||||
|
when: ckan_geonetwork_harvester
|
||||||
|
tags: [ 'ckan', 'geonetwork', 'ckan_plugins' ]
|
||||||
|
|
||||||
|
- name: Initialize the CKAN ckanext-harvest plugin
|
||||||
|
become: True
|
||||||
|
become_user: '{{ ckan_shell_user }}'
|
||||||
|
shell: . /usr/lib/ckan/default/bin/activate ; paster --plugin=ckanext-harvest harvester initdb --config={{ ckan_config_file }}
|
||||||
|
when: ( ckanext_harvest_install | changed )
|
||||||
|
tags: [ 'ckan', 'geonetwork', 'ckan_plugins' ]
|
||||||
|
|
||||||
|
- name: Download the CKAN ckanext-spatial plugin
|
||||||
|
become: True
|
||||||
|
become_user: '{{ ckan_shell_user }}'
|
||||||
|
pip: name='{{ ckan_ckanext_spatial_url }}' virtualenv={{ ckan_virtenv }}
|
||||||
|
notify: Restart CKAN
|
||||||
|
when: ckan_geonetwork_harvester
|
||||||
|
register: ckanext_spatial_install
|
||||||
|
tags: [ 'ckan', 'ckan_spatial', 'ckan_plugins' ]
|
||||||
|
|
||||||
|
- name: Download the CKAN ckanext-spatial requirements
|
||||||
|
become: True
|
||||||
|
become_user: '{{ ckan_shell_user }}'
|
||||||
|
pip: requirements={{ ckan_virtenv }}/src/ckanext-spatial/pip-requirements.txt virtualenv={{ ckan_virtenv }}
|
||||||
|
when: ckan_geonetwork_harvester
|
||||||
|
tags: [ 'ckan', 'ckan_spatial', 'ckan_plugins' ]
|
||||||
|
|
||||||
|
# Run the ckanext-spatial DB initialization once, right after the plugin has
# been installed into the virtualenv.
- name: Initialize the CKAN ckanext-spatial plugin
  become: True
  become_user: '{{ ckan_shell_user }}'
  shell: . /usr/lib/ckan/default/bin/activate ; paster --plugin=ckanext-spatial spatial initdb --config={{ ckan_config_file }}
  when: ( ckanext_spatial_install | changed )
  tags: [ 'ckan', 'ckan_spatial', 'ckan_plugins' ]
|
||||||
|
|
||||||
|
- name: Download the CKAN Geonetwork plugin code
|
||||||
|
become: True
|
||||||
|
become_user: '{{ ckan_shell_user }}'
|
||||||
|
git: repo={{ ckan_geonetwork_harvester_url }} dest=/usr/lib/ckan/default/src/ckanext-geonetwork
|
||||||
|
when: ckan_geonetwork_harvester
|
||||||
|
register: install_geonetwork_harvester
|
||||||
|
tags: [ 'ckan', 'ckan_geonetwork', 'ckan_plugins' ]
|
||||||
|
|
||||||
|
- name: Install the CKAN Geonetwork plugin code
|
||||||
|
become: True
|
||||||
|
become_user: '{{ ckan_shell_user }}'
|
||||||
|
shell: . /usr/lib/ckan/default/bin/activate ; cd /usr/lib/ckan/default/src/ckanext-geonetwork ; python setup.py develop
|
||||||
|
when: ( install_geonetwork_harvester | changed )
|
||||||
|
notify: Restart CKAN
|
||||||
|
tags: [ 'ckan', 'ckan_geonetwork', 'ckan_plugins' ]
|
||||||
|
|
||||||
|
- name: Install the script that updates the tracking data
|
||||||
|
template: src=tracker_update.sh.j2 dest={{ ckan_virtenv }}/bin/tracker_update owner={{ ckan_shell_user }} group={{ ckan_shell_user }} mode=0555
|
||||||
|
when: ckan_geonetwork_harvester
|
||||||
|
tags: [ 'ckan', 'ckan_geonetwork', 'ckan_plugins', 'tracker' ]
|
||||||
|
|
||||||
|
- name: Install the cron job that runs the tracker update script
|
||||||
|
cron: name="tracker update" minute="0" job="{{ ckan_virtenv }}/bin/tracker_update > /var/log/ckan/tracker_update.log 2>&1" user={{ ckan_shell_user }}
|
||||||
|
when: ckan_geonetwork_harvester
|
||||||
|
tags: [ 'ckan', 'ckan_geonetwork', 'ckan_plugins', 'tracker' ]
|
||||||
|
|
||||||
|
- name: Download the CKAN PDF viewer plugin
|
||||||
|
become: True
|
||||||
|
become_user: '{{ ckan_shell_user }}'
|
||||||
|
pip: name='{{ ckan_ckanext_pdfview_url }}' virtualenv={{ ckan_virtenv }}
|
||||||
|
when: ckan_pdfview
|
||||||
|
notify: Restart CKAN
|
||||||
|
tags: [ 'ckan', 'ckan_pdfview', 'ckan_plugins' ]
|
||||||
|
|
||||||
|
- name: Download the CKAN Privatedatasets extension
|
||||||
|
become: True
|
||||||
|
become_user: '{{ ckan_shell_user }}'
|
||||||
|
pip: name='{{ ckan_privatedatasets_url }}' virtualenv={{ ckan_virtenv }}
|
||||||
|
when: ckan_privatedatasets
|
||||||
|
notify: Restart CKAN
|
||||||
|
tags: [ 'ckan', 'ckan_privdatasets', 'ckan_plugins' ]
|
||||||
|
|
||||||
|
- name: Download the CKAN hierarchy plugin code
|
||||||
|
become: True
|
||||||
|
become_user: '{{ ckan_shell_user }}'
|
||||||
|
pip: name='{{ ckan_hierarchy_url }}' virtualenv={{ ckan_virtenv }}
|
||||||
|
when: ckan_hierarchy
|
||||||
|
notify: Restart CKAN
|
||||||
|
tags: [ 'ckan', 'ckan_hierarchy', 'ckan_plugins' ]
|
||||||
|
|
||||||
|
- name: Download the CKAN pages plugin code
|
||||||
|
become: True
|
||||||
|
become_user: '{{ ckan_shell_user }}'
|
||||||
|
pip: name='{{ ckan_pages_url }}' virtualenv={{ ckan_virtenv }}
|
||||||
|
when: ckan_pages
|
||||||
|
notify: Restart CKAN
|
||||||
|
tags: [ 'ckan', 'ckan_pages', 'ckan_plugins' ]
|
||||||
|
|
||||||
|
- name: Overwrite the base.py ckanext-spatial plugin file to enable authentication against the Geonetwork nodes
|
||||||
|
copy: src=base.py dest=/usr/lib/ckan/default/src/ckanext-spatial/ckanext/spatial/harvesters/base.py owner={{ ckan_shell_user }} group={{ ckan_shell_user }} mode=0644 backup=yes
|
||||||
|
notify: Restart CKAN
|
||||||
|
tags: [ 'ckan', 'ckan_pages', 'ckan_plugins', 'ckan_geo_auth' ]
|
||||||
|
|
||||||
|
# The Debian/Ubuntu service is named 'apache2' (as the 'Restart CKAN' handler
# in this role already uses), not 'apache'.
- name: Restart apache
  service: name=apache2 state=restarted enabled=yes
  when: ( ckan_install | changed )
  tags: ckan
|
||||||
|
|
||||||
|
- name: Restart nginx
|
||||||
|
service: name=nginx state=restarted enabled=yes
|
||||||
|
when: ( ckan_install | changed )
|
||||||
|
tags: ckan
|
||||||
|
|
||||||
|
# To create the first sysadmin user:
|
||||||
|
# . /usr/lib/ckan/default/bin/activate
|
||||||
|
# cd /usr/lib/ckan/default/src/ckan
|
||||||
|
# You have to create your first CKAN sysadmin user from the command line. For example, to create a user called seanh and make him a sysadmin:
|
||||||
|
|
||||||
|
# paster sysadmin add seanh -c /etc/ckan/default/production.ini
|
||||||
|
#
|
||||||
|
# To create some test data:
|
||||||
|
# paster create-test-data -c /etc/ckan/default/production.ini
|
|
@ -0,0 +1,8 @@
|
||||||
|
#!/bin/bash

# Update CKAN's page-view tracking data and then rebuild the search index so
# the refreshed counts become available to search/sorting. Intended to be run
# periodically (see the 'tracker update' cron job installed by the ckan role).

. {{ ckan_virtenv }}/bin/activate

paster --plugin=ckan tracking update -c {{ ckan_config_file }}
paster --plugin=ckan search-index rebuild -r -c {{ ckan_config_file }}

exit 0
|
|
@ -0,0 +1,10 @@
|
||||||
|
---
# Create the PostgreSQL extensions listed for each database.
# psql_db_data is expected to be a list of dicts, each with a 'name' key and
# an 'extensions' list — TODO confirm against the role defaults.
- name: Add postgres extensions to the databases, if needed
  become: True
  become_user: postgres
  postgresql_ext: name={{ item.1 }} db={{ item.0.name }} port={{ psql_db_port }}
  with_subelements:
    - '{{ psql_db_data | default([]) }}'
    - extensions
  tags: [ 'postgresql', 'postgres', 'pg_extensions' ]
|
||||||
|
|
Loading…
Reference in New Issue