Movie_Data_Capture/scraper.py
2023-12-21 18:26:12 +08:00

254 lines
10 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# build-in lib
import json
import secrets
import typing
from pathlib import Path
# third party lib
import opencc
from lxml import etree
# project wide definitions
import config
from ADC_function import (translate,
load_cookies,
file_modification_days,
delete_all_elements_in_str,
delete_all_elements_in_list
)
from scrapinglib.api import search
def get_data_from_json(
file_number: str,
open_cc: opencc.OpenCC,
specified_source: str, specified_url: str) -> typing.Optional[dict]:
"""
iterate through all services and fetch the data 从网站上查询片名解析JSON返回元数据
:param file_number: 影片名称
:param open_cc: 简繁转换器
:param specified_source: 指定的媒体数据源
:param specified_url: 指定的数据查询地址, 目前未使用
:return 给定影片名称的具体信息
"""
try:
actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml'))
info_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_info.xml'))
except:
actor_mapping_data = etree.fromstring("<html></html>", etree.HTMLParser())
info_mapping_data = etree.fromstring("<html></html>", etree.HTMLParser())
conf = config.getInstance()
# default fetch order list, from the beginning to the end
sources = conf.sources()
# TODO 准备参数
# - 清理 ADC_function, webcrawler
proxies: dict = None
config_proxy = conf.proxy()
if config_proxy.enable:
proxies = config_proxy.proxies()
ca_cert = None
if conf.cacert_file():
ca_cert = conf.cacert_file()
json_data = search(file_number, sources, proxies=proxies, verify=ca_cert,
morestoryline=conf.is_storyline(),
specifiedSource=specified_source, specifiedUrl=specified_url,
debug = conf.debug())
# Return if data not found in all sources
if not json_data:
print('[-]Movie Number not found!')
return None
# 增加number严格判断避免提交任何number
if str(json_data.get('number')).upper() != file_number.upper():
try:
if json_data.get('allow_number_change'):
pass
except:
print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number'))))
return None
# ================================================网站规则添加结束================================================
if json_data.get('title') == '':
print('[-]Movie Number or Title not found!')
return None
title = json_data.get('title')
actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',') # 字符串转列表
actor_list = [actor.strip() for actor in actor_list] # 去除空白
director = json_data.get('director')
release = json_data.get('release')
number = json_data.get('number')
studio = json_data.get('studio')
source = json_data.get('source')
runtime = json_data.get('runtime')
outline = json_data.get('outline')
label = json_data.get('label')
series = json_data.get('series')
year = json_data.get('year')
if json_data.get('cover_small'):
cover_small = json_data.get('cover_small')
else:
cover_small = ''
if json_data.get('trailer'):
trailer = json_data.get('trailer')
else:
trailer = ''
if json_data.get('extrafanart'):
extrafanart = json_data.get('extrafanart')
else:
extrafanart = ''
imagecut = json_data.get('imagecut')
tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',') # 字符串转列表 @
while 'XXXX' in tag:
tag.remove('XXXX')
while 'xxx' in tag:
tag.remove('xxx')
actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
# if imagecut == '3':
# DownloadFileWithFilename()
# ====================处理异常字符====================== #\/:*?"<>|
actor = special_characters_replacement(actor)
actor_list = [special_characters_replacement(a) for a in actor_list]
title = special_characters_replacement(title)
label = special_characters_replacement(label)
outline = special_characters_replacement(outline)
series = special_characters_replacement(series)
studio = special_characters_replacement(studio)
director = special_characters_replacement(director)
tag = [special_characters_replacement(t) for t in tag]
release = release.replace('/', '-')
tmpArr = cover_small.split(',')
if len(tmpArr) > 0:
cover_small = tmpArr[0].strip('\"').strip('\'')
# ====================处理异常字符 END================== #\/:*?"<>|
# 处理大写
if conf.number_uppercase():
json_data['number'] = number.upper()
# 返回处理后的json_data
json_data['title'] = title
json_data['original_title'] = title
json_data['actor'] = actor
json_data['release'] = release
json_data['cover_small'] = cover_small
json_data['tag'] = tag
json_data['year'] = year
json_data['actor_list'] = actor_list
json_data['trailer'] = trailer
json_data['extrafanart'] = extrafanart
json_data['label'] = label
json_data['outline'] = outline
json_data['series'] = series
json_data['studio'] = studio
json_data['director'] = director
if conf.is_translate():
translate_values = conf.translate_values().split(",")
for translate_value in translate_values:
if json_data[translate_value] == "":
continue
if conf.get_translate_engine() == "azure":
t = translate(
json_data[translate_value],
target_language="zh-Hans",
engine=conf.get_translate_engine(),
key=conf.get_translate_key(),
)
else:
if len(json_data[translate_value]):
if type(json_data[translate_value]) == str:
json_data[translate_value] = special_characters_replacement(json_data[translate_value])
json_data[translate_value] = translate(json_data[translate_value])
else:
for i in range(len(json_data[translate_value])):
json_data[translate_value][i] = special_characters_replacement(
json_data[translate_value][i])
list_in_str = ",".join(json_data[translate_value])
json_data[translate_value] = translate(list_in_str).split(',')
if open_cc:
cc_vars = conf.cc_convert_vars().split(",")
ccm = conf.cc_convert_mode()
def convert_list(mapping_data, language, vars):
total = []
for i in vars:
if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")) != 0:
i = mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")[0]
total.append(i)
return total
def convert(mapping_data, language, vars):
if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)) != 0:
return mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)[0]
else:
raise IndexError('keyword not found')
for cc in cc_vars:
if json_data[cc] == "" or len(json_data[cc]) == 0:
continue
try:
if ccm == 1:
json_data[cc] = convert(info_mapping_data, "zh_cn", json_data[cc])
json_data[cc] = delete_all_elements_in_str("删除", json_data[cc])
elif ccm == 2:
json_data[cc] = convert(info_mapping_data, "zh_tw", json_data[cc])
json_data[cc] = delete_all_elements_in_str("删除", json_data[cc])
elif ccm == 3:
json_data[cc] = convert(info_mapping_data, "jp", json_data[cc])
json_data[cc] = delete_all_elements_in_str("删除", json_data[cc])
except IndexError:
json_data[cc] = open_cc.convert(json_data[cc])
except:
pass
naming_rule = ""
original_naming_rule = ""
for i in conf.naming_rule().split("+"):
if i not in json_data:
naming_rule += i.strip("'").strip('"')
original_naming_rule += i.strip("'").strip('"')
else:
item = json_data.get(i)
naming_rule += item if type(item) is not list else "&".join(item)
# PATCH处理[title]存在翻译的情况后续NFO文件的original_name只会直接沿用naming_rule,这导致original_name非原始名
# 理应在翻译处处理 naming_rule和original_naming_rule
if i == 'title':
item = json_data.get('original_title')
original_naming_rule += item if type(item) is not list else "&".join(item)
json_data['naming_rule'] = naming_rule
json_data['original_naming_rule'] = original_naming_rule
return json_data
def special_characters_replacement(text) -> str:
if not isinstance(text, str):
return text
return (text.replace('\\', ''). # U+2216 SET MINUS @ Basic Multilingual Plane
replace('/', ''). # U+2215 DIVISION SLASH @ Basic Multilingual Plane
replace(':', ''). # U+A789 MODIFIER LETTER COLON @ Latin Extended-D
replace('*', ''). # U+2217 ASTERISK OPERATOR @ Basic Multilingual Plane
replace('?', ''). # U+FF1F FULLWIDTH QUESTION MARK @ Basic Multilingual Plane
replace('"', ''). # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane
replace('<', ''). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
replace('>', ''). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
replace('&lsquo;', ''). # U+02018 LEFT SINGLE QUOTATION MARK
replace('&rsquo;', ''). # U+02019 RIGHT SINGLE QUOTATION MARK
replace('&hellip;', '').
replace('&amp;', '').
replace("&", '')
)