import re
from loguru import logger
from flexget import plugin
from flexget.event import event
# Rebind the imported loguru logger with name='manipulate' so every log call below carries it.
logger = logger.bind(name='manipulate')
[docs]
class Manipulate:
    r"""The manipulate plugin: rewrite entry fields with regular expressions.

    For every configured destination field the operations are applied in a
    fixed order: ``remove`` (which short-circuits everything else), then
    ``erase``, then ``extract``, then ``replace``. The value is read from the
    ``from`` field when given, otherwise from the destination field itself.

    Usage::

        manipulate:
          - <destination field>:
              [find_all]: <boolean>
              [phase]: <phase>
              [from]: <source field>
              [extract]: <regexp>
              [separator]: <text>
              [replace]:
                regexp: <regexp>
                format: <regexp>
              [remove]: <boolean>
              [erase]: <list of regexps>

    Example:

    .. code:: yaml

        manipulate:
          - title:
              extract: \[\d\d\d\d\](.*)
          - title:
              erase:
                - "^unwanted.noise."
                - "^more.advertisement."
    """

    schema = {
        'type': 'array',
        'items': {
            'type': 'object',
            'additionalProperties': {
                'type': 'object',
                'properties': {
                    'phase': {'enum': ['metainfo', 'filter', 'modify']},
                    'from': {'type': 'string'},
                    'extract': {'type': 'string', 'format': 'regex'},
                    'separator': {'type': 'string'},
                    'remove': {'type': 'boolean'},
                    'erase': {
                        'type': 'array',
                        'items': {'type': 'string', 'format': 'regex'},
                    },
                    'find_all': {'type': 'boolean'},
                    'replace': {
                        'type': 'object',
                        'properties': {
                            'regexp': {'type': 'string', 'format': 'regex'},
                            'format': {'type': 'string'},
                        },
                        'required': ['regexp', 'format'],
                        'additionalProperties': False,
                    },
                },
                'additionalProperties': False,
            },
        },
    }

    def on_task_start(self, task, config):
        """Separate the config into a dict with a list of jobs per phase.

        Allow us to skip phases without any jobs in them.

        :param task: Task being started (not used here)
        :param config: List of ``{field: field_config}`` mappings
        """
        self.phase_jobs = {'filter': [], 'metainfo': [], 'modify': []}
        for item in config:
            for field, item_config in item.items():
                # Get the phase specified for this field, or use default of metainfo
                phase = item_config.get('phase', 'metainfo')
                # Queue a single-field job rather than the whole item, so a
                # multi-field item is neither run in the wrong phase nor
                # queued more than once for the same phase.
                self.phase_jobs[phase].append({field: item_config})

    def _run_phase(self, task, phase):
        """Run the jobs queued for ``phase`` on every entry of ``task``."""
        jobs = self.phase_jobs[phase]
        if not jobs:
            # return if no jobs for this phase
            return
        modified = sum(self.process(entry, jobs) for entry in task.entries)
        logger.verbose('Modified {} entries.', modified)

    # NOTE(review): no explicit priority is set for the metainfo handler;
    # confirm whether it should match the PRIORITY_FIRST used for the
    # filter and modify handlers below.
    def on_task_metainfo(self, task, config):
        """Run the jobs assigned to the ``metainfo`` phase (the default phase)."""
        self._run_phase(task, 'metainfo')

    @plugin.priority(plugin.PRIORITY_FIRST)
    def on_task_filter(self, task, config):
        """Run the jobs assigned to the ``filter`` phase."""
        self._run_phase(task, 'filter')

    @plugin.priority(plugin.PRIORITY_FIRST)
    def on_task_modify(self, task, config):
        """Run the jobs assigned to the ``modify`` phase."""
        self._run_phase(task, 'modify')

    def process(self, entry, jobs):
        """Process given jobs from config for an entry.

        :param entry: Entry to modify
        :param jobs: Config items to run on this entry
        :return: True if any fields were modified
        """
        modified = False
        regex_flags = re.IGNORECASE | re.UNICODE
        for item in jobs:
            for field, config in item.items():
                from_field = config.get('from', field)
                field_value = entry.get(from_field)
                logger.debug(
                    'field: `{}` from_field: `{}` field_value: `{}`',
                    field,
                    from_field,
                    field_value,
                )

                if config.get('remove'):
                    # Remove entire field
                    if field in entry:
                        del entry[field]
                        modified = True
                    continue

                if config.get('erase'):
                    # Erase text matching regex patterns
                    if not field_value:
                        logger.warning(
                            'Cannot erase patterns, field `{}` is not present', from_field
                        )
                        continue
                    original_value = field_value
                    for pattern in config['erase']:
                        field_value = re.sub(pattern, '', field_value, flags=regex_flags)
                    field_value = field_value.strip()
                    if original_value != field_value:
                        logger.debug('field `{}` after erase patterns: `{}`', field, field_value)
                    # Fail entry if title field becomes empty after erase
                    if field == 'title' and not field_value:
                        entry.fail('Title became empty after erase operation')

                if 'extract' in config:
                    if not field_value:
                        logger.warning('Cannot extract, field `{}` is not present', from_field)
                        continue
                    separator = config.get('separator', ' ')
                    if config.get('find_all'):
                        matches = re.findall(config['extract'], field_value, regex_flags)
                        logger.debug('all matches: {}', matches)
                        # With more than one capture group re.findall yields
                        # tuples, which str.join cannot handle. Flatten them
                        # and skip empty groups, mirroring how the
                        # single-match branch skips unmatched groups.
                        parts = []
                        for found in matches:
                            if isinstance(found, tuple):
                                parts.extend(group for group in found if group)
                            else:
                                parts.append(found)
                        field_value = separator.join(parts).strip()
                        logger.debug('field `{}` after extract: `{}`', field, field_value)
                    else:
                        match = re.search(config['extract'], field_value, regex_flags)
                        if match:
                            groups = [x for x in match.groups() if x is not None]
                            logger.debug('groups: {}', groups)
                            field_value = separator.join(groups).strip()
                            logger.debug('field `{}` after extract: `{}`', field, field_value)

                if 'replace' in config:
                    if not field_value:
                        logger.warning('Cannot replace, field `{}` is not present', from_field)
                        continue
                    replace_config = config['replace']
                    regexp = re.compile(replace_config['regexp'], flags=regex_flags)
                    field_value = regexp.sub(replace_config['format'], field_value).strip()
                    logger.debug('field `{}` after replace: `{}`', field, field_value)

                # entry.get() avoids a KeyError when the destination field does
                # not exist yet and none of the operations above produced it.
                if from_field != field or entry.get(field) != field_value:
                    logger.verbose('Field `{}` is now `{}`', field, field_value)
                    modified = True
                entry[field] = field_value
        return modified
[docs]
@event('plugin.register')
def register_plugin():
    """Register ``Manipulate`` under the name ``manipulate`` with ``api_ver=2``."""
    plugin.register(
        Manipulate,
        'manipulate',
        api_ver=2,
    )