Source code for flexget.utils.parsers.movie
import re
from datetime import datetime
from loguru import logger
from flexget.utils import qualities
from flexget.utils.parsers.parser import TitleParser
from flexget.utils.tools import str_to_int
# Rebind the imported loguru logger with name='movieparser'; the rest of this
# module logs through the bound instance.
logger = logger.bind(name='movieparser')
[docs]
def diff_pos(string1, string2):
    """Find the first index at which ``string1`` stops matching ``string2``.

    Only the characters of ``string1`` are walked. An index counts as a
    difference when ``string2`` has no character there or holds a different
    one.

    :param string1: String whose characters are scanned.
    :param string2: String to compare against.
    :return: Index of the first difference, or ``None`` when every character
        of ``string1`` matches ``string2`` (including when ``string1`` is
        empty or a prefix of ``string2``).
    """
    other_len = len(string2)
    index = 0
    while index < len(string1):
        # Running off the end of string2 counts as a difference too.
        if index >= other_len or string1[index] != string2[index]:
            return index
        index += 1
    return None
[docs]
class MovieParser(TitleParser):
    """Parse a movie release title into name, year, quality and proper count.

    The results of the most recent :meth:`parse` call are stored on the
    instance as ``name``, ``year``, ``year_pos``, ``quality`` and
    ``proper_count``.
    """

    def __init__(self):
        # Title used by parse() when it is called without an argument.
        self.data = None
        self.reset()
        TitleParser.__init__(self)

    def reset(self):
        """Clear all parsing results so the instance can be reused."""
        # parsing results
        self.name = None
        self.year = None
        self.year_pos = None
        self.quality = qualities.Quality()
        self.proper_count = 0

    def __str__(self):
        return f'<MovieParser(name={self.name},year={self.year},quality={self.quality})>'

    def parse(self, data=None):
        """Parse movie name. Populates name, year, quality and proper_count attributes.

        :param data: Title string to parse. When ``None``, ``self.data`` is
            used instead.
        :return: ``None``; the results are stored on the instance.
        """
        # Reset before parsing, so the parser can be reused.
        self.reset()

        if data is None:
            data = self.data

        # Move anything in leading brackets to the end
        data = re.sub(r'^\[(.*?)\](.*)', r'\2 \1', data)

        for char in '[]()_,.':
            data = data.replace(char, ' ')

        # if there are no spaces
        if ' ' not in data:
            data = data.replace('-', ' ')

        # remove unwanted words (imax, ..)
        # NOTE(review): the return value is discarded and ``str`` is immutable,
        # so this call cannot change ``data``. Left as-is; confirm what
        # TitleParser.remove_words returns before assigning its result.
        self.remove_words(data, self.remove)

        data = self.strip_spaces(data)

        # split to parts
        parts = data.split(' ')
        # 256 is the sentinel for "no cut point found yet".
        cut_part = 256
        all_caps = True
        # Loop invariant, so look it up once.
        current_year = datetime.now().year
        for part_pos, part in enumerate(parts):
            cut = False
            # Don't let the first word be cutoff word
            if part_pos < 1:
                continue
            lowered = part.lower()
            # check for year
            num = str_to_int(part)
            if num is not None and 1930 < num <= current_year:
                if self.year_pos == cut_part:
                    # Looks like a year, but we already set the cutpoint to a year, let's move it forward
                    cut_part = part_pos
                self.year = num
                self.year_pos = part_pos
                cut = True
            # Don't consider all caps words cut words if the whole title has been all caps
            if not part.isupper():
                all_caps = False
            # if length > 3 and whole word in uppers, consider as cut word (most likely a group name)
            if len(part) > 3 and part.isupper() and part.isalpha() and not all_caps:
                cut = True
            # check for cutoff words
            if lowered in self.cutoffs:
                cut = True
            # check for propers, 'real' and 'final' are too common in movie titles,
            # only treat those two as propers if they come after the year.
            # The word must always be a known proper; the year only relaxes the
            # 'real'/'final' exclusion.
            if lowered in self.propers and (lowered not in ['real', 'final'] or self.year):
                self.proper_count += 1
                cut = True
            # update cut position; compare the current position, not the index
            # of the first occurrence, so a repeated word cannot push an
            # earlier cut point forward.
            if cut and part_pos < cut_part:
                cut_part = part_pos

        if cut_part != 256:
            logger.debug('parts: {}, cut is: {}', parts, parts[cut_part])

        # calculate cut position from cut_part
        abs_cut = len(' '.join(parts[:cut_part]))

        logger.debug(
            'after parts check, cut data would be: `{}` abs_cut: {}', data[:abs_cut], abs_cut
        )

        # parse quality
        quality = qualities.Quality(data)
        if quality:
            self.quality = quality
            # remaining string is same as data but quality information removed
            # find out position where there is first difference, this is earliest
            # quality bit, anything after that has no relevance to the movie name
            dp = diff_pos(data, quality.clean_text)
            if dp is not None:
                logger.debug('quality start: {}', dp)
                if dp < abs_cut:
                    logger.debug('quality cut is even shorter')
                    abs_cut = dp

        # make cut
        data = data[:abs_cut].strip()
        logger.debug('data cut to `{}` - this will be the name', data)

        # save results
        self.name = data