
173 lines
6.2 KiB
Raw Permalink Normal View History

2023-09-12 04:16:15 +00:00
#!/usr/bin/env python3
# encoding: utf-8
# This script checks all sound/music files in the repository for whether they've been modified or added without updating the file tracking sound or music copyright
# This check was used in continuous integration for image files as well prior to September 2024.
2023-09-12 04:16:15 +00:00
import argparse
import contextlib
import csv
import hashlib
2023-09-12 04:16:15 +00:00
from operator import itemgetter
import os
from pathlib import Path
from subprocess import check_output
import sys
# csv file layout:
# [0] = current git commit date
# [1] = file path, relative to the repository root
# [2] = license name(s)
# [3] = authorship information
# [4] = notes
# [5] = new git commit date, if different from the value in [0]
# [6] = current md5 hash
# Add new licenses to this list:
# Avoid things like "GNU GPL v2+;CC BY-SA 4.0", unless you mean to dual license
# under either GNU GPL v2+ or CC BY-SA 4.0. GNU GPL v2+ and CC BY-SA 4.0 (e.g.
# a GNU GPL v2+ file with CC BY-SA 4.0 modifications) isn't legally possible.
known_licenses = (
    "CC BY-SA 4.0",
    "GNU GPL v2+",
    # NOTE(review): this tuple was unterminated in the copy under review, so
    # further entries may have been lost - confirm against the upstream script.
)
def do_git(file):
    """Return the date of the most recent git commit that touched `file`.

    The date is formatted as YYYY/MM/DD. `git` is run in the current working
    directory, so `file` must be resolvable from there.

    Raises subprocess.CalledProcessError if git exits with a non-zero status.
    """
    # "--" ends option parsing so that `file` is always treated as a path,
    # never as a revision name or an option.
    output = check_output(["git", "log", "-1", "--format=%ad", "--date=format:%Y/%m/%d", "--", file])
    return output.decode('UTF-8').rstrip('\n')
def do_hash(file):
    """Return the hexadecimal MD5 digest of the contents of `file`.

    The digest is only used to detect content changes, not for security.
    """
    md5 = hashlib.md5()
    with open(file, 'rb') as f:
        # Read in 64 KiB chunks so large audio files are never held in memory
        # all at once; iter() stops at the first empty read (end of file).
        for data in iter(lambda: f.read(65536), b""):
            md5.update(data)
    return md5.hexdigest()
# program logic start
# Command line interface: every option has a default, so the script can be run
# without arguments from the root of the repository.
args = argparse.ArgumentParser()
args.add_argument("--repo", default=".", help="The directory of the Wesnoth repository to run this script against.")
args.add_argument("--output", default="copyrights.csv", help="The file to write the results of this script to.")
args.add_argument("--input", default="copyrights.csv", help="The file to read the existing copyright data from.")
options = args.parse_args()
# Rows of the input CSV file, keyed by file path (column [1])
csv_data = {}
# Too few fields
missing_fields = []
# Too many fields, possibly due to an unquoted comma
extra_fields = []
# New files
added = []
# Changed files
changed = []
# Already mentioned in the CSV file, but lacking something in either the license or author fields
incomplete = []
# Already mentioned in the CSV file, but have something in the needs update field
update = []
# Rows that are carried over to the output as they are
unchanged = []
# Entries reported as removed files
removed = []
# Sanity-check for known licenses
unknown_licenses = []
# Load the existing copyright data. newline="" lets the csv module handle line
# endings itself, including newlines embedded in quoted fields.
with open(options.input, encoding="utf-8", newline="") as csvfile:
    reader = csv.reader(csvfile)
    # NOTE(review): previous_file is only assigned in the code visible here;
    # presumably it backs a sort-order check - confirm against upstream.
    previous_file = ""
    for row in reader:
        # Skip blank lines (csv yields an empty list for them) and the header row.
        if not row or row[0] == "Date":
            continue

        file = row[1]
        previous_file = file

        # A file listed in the CSV that no longer exists is reported as removed
        # and left out of the output.
        if not os.path.exists(file):
            removed.append(file)
            continue

        csv_data[file] = row
# Walk the repository and sort every sound/music file into one of the result
# lists. Note that a FileNotFoundError raised anywhere inside this block (for
# example by do_hash or do_git) silently ends the whole scan.
with contextlib.suppress(FileNotFoundError):
    for root, _, files in os.walk(options.repo):
        for filename in files:
            if Path(filename).suffix not in (".wav", ".ogg"):
                continue

            file_path = os.path.normpath(os.path.join(root, filename))
            if os.path.sep != '/':
                # Always use slashes for the file path irrespective of OS used to run the update
                file_path = file_path.replace(os.path.sep, '/')

            file_hash = do_hash(file_path)
            row = csv_data.get(file_path)

            # The order of these checks matters: malformed rows are reported
            # before any of their fields are inspected by index.
            if row is None:
                added.append(["", file_path, "", "", "", do_git(file_path), file_hash])
            elif len(row) < 7:
                missing_fields.append(row)
            elif len(row) > 7:
                extra_fields.append(row)
            elif row[5] != "":
                update.append(row)
            elif row[6] != file_hash:
                # The content changed: record the new commit date and hash.
                row[5] = do_git(file_path)
                row[6] = file_hash
                changed.append(row)
            elif row[2].strip() == "" or row[3].strip() == "":
                incomplete.append(row)
            elif row[2] not in known_licenses:
                # NOTE(review): the body of this branch was not visible in the
                # copy under review. The row is kept in the output so that no
                # data is lost - confirm against upstream.
                unknown_licenses.append(row[2] + " (" + file_path + ")")
                unchanged.append(row)
            else:
                unchanged.append(row)
# Collect every row that belongs in the regenerated CSV file.
final_output = missing_fields + extra_fields + added + changed + incomplete + update + unchanged
# Sort by file path so the output does not depend on the order of os.walk.
# NOTE(review): the sort was not visible in the copy under review and is
# inferred from the otherwise unused itemgetter import - confirm.
final_output.sort(key=itemgetter(1))

if options.output != "":
    # newline="" keeps the explicit "\n" line terminator on every platform.
    with open(options.output, 'w', encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["Date", "File", "License", "Author - Real Name(other name);Real Name(other name);etc", "Notes", "Needs Update", "MD5"])
        writer.writerows(final_output)
else:
    # An empty --output sends the rows to standard output instead of a file.
    writer = csv.writer(sys.stdout, lineterminator="\n")
    writer.writerows(final_output)
# Summarize the findings; any non-empty category marks the run as failed.
any_check_failed = False

if removed:
    any_check_failed = True
    print("There are "+str(len(removed))+" removed files")

count_missing_fields = len(missing_fields)
count_extra_fields = len(extra_fields)
count_added = len(added)
count_changed = len(changed)
count_incomplete = len(incomplete)
count_update = len(update)

# The per-category details are only printed when at least one category has entries.
if count_missing_fields > 0 or count_extra_fields > 0 or count_added > 0 or count_changed > 0 or count_incomplete > 0 or count_update > 0:
    any_check_failed = True
    print("\nThere are "+str(count_missing_fields)+" rows with too few fields:\n"+"\n".join(",".join(a) for a in missing_fields))
    print("\nThere are "+str(count_extra_fields)+" rows with too many fields, possibly due to an unquoted comma:\n"+"\n".join(",".join(a) for a in extra_fields))
    print("\nThere are "+str(count_added)+" new files:\n"+"\n".join(a[1] for a in added))
    print("\nThere are "+str(count_changed)+" changed files:\n"+"\n".join(a[1] for a in changed))
    print("\nThere are "+str(count_incomplete)+" files that lack license or author information:\n"+"\n".join(a[1] for a in incomplete))
    print("\nThere are "+str(count_update)+" files that need updated information:\n"+"\n".join(a[1] for a in update))

if unknown_licenses:
    any_check_failed = True
    print("Unknown licenses:")
    print(" " + "\n ".join(unknown_licenses))
if any_check_failed:
2023-09-12 04:16:15 +00:00