#!/usr/bin/env python3
# delete_duplicates_maildir.py
# 2023-04-29
# by Gernot Walzl
# If `fetchmail --all` is run twice, duplicate messages will be in the mailbox.
# This script removes duplicate messages (multiple files with the same content) from a maildir.
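#
# Example invocation (the maildir path is just an example):
#   ./delete_duplicates_maildir.py ~/Maildir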
import argparse
import hashlib
import os


class MaildirCleaner:

    def __init__(self):
        # maps a Message-ID (bytes) to {'filepaths': [...], 'sha256sums': [...]}
        self._messages = {}
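
    # Extracts the Message-ID header from a message file. If the header value
    # is folded onto the following line, that line is read instead.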
    @staticmethod
    def _read_message_id(filepath_message):
        message_id = None
        with open(filepath_message, 'rb') as file_msg:
            msg_id_next_line = False
            for line in file_msg:
                if msg_id_next_line:
                    message_id = line.strip()
                elif line.lower().startswith(b'message-id:'):
                    # header field names are case-insensitive (RFC 5322)
                    message_id = line[11:].strip()
                    if not message_id:
                        msg_id_next_line = True
                elif not line.strip():
                    break  # blank line ends the header section
                if message_id:
                    break
        return message_id
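
    # Scans one maildir subdirectory and groups the message files by Message-ID.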
    def _read_message_ids(self, path):
        for filename in sorted(os.listdir(path)):
            filepath = os.path.join(path, filename)
            msg_id = self._read_message_id(filepath)
            if msg_id:
                if msg_id not in self._messages:
                    self._messages[msg_id] = {'filepaths': []}
                self._messages[msg_id]['filepaths'].append(filepath)
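
    # Computes a SHA-256 checksum over the message content so that files
    # sharing a Message-ID can be compared.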
    @staticmethod
    def _compute_checksum_content(filepath_message):
        sha256 = hashlib.sha256()
        with open(filepath_message, 'rb') as file_msg:
            lines = file_msg.readlines()
            for line in lines[4:]:  # timestamp of fetchmail is skipped
                sha256.update(line)
        return sha256.hexdigest()
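
    # Checksums are only computed for messages whose Message-ID occurs more than once.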
    def _compute_checksums_duplicates(self):
        for msg_id, msg in self._messages.items():
            filepaths = msg['filepaths']
            if len(filepaths) > 1:
                digests = []
                for filepath in filepaths:
                    digests.append(self._compute_checksum_content(filepath))
                self._messages[msg_id]['sha256sums'] = digests
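
    # A maildir stores one message per file in its 'cur' and 'new' subdirectories.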
    def read_maildir(self, maildir):
        self._read_message_ids(os.path.join(maildir, "cur"))
        self._read_message_ids(os.path.join(maildir, "new"))
        self._compute_checksums_duplicates()
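
    # Returns True if all given elements are equal to each other.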
    @staticmethod
    def _are_all_equal(elements):
        result = True
        for cnt in range(1, len(elements)):
            if elements[0] != elements[cnt]:
                result = False
                break
        return result
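
    # Prints all messages that share a Message-ID. Files marked with 'D' have
    # identical content and are candidates for deletion.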
    def print_duplicates(self):
        for msg_id, msg in self._messages.items():
            filepaths = msg['filepaths']
            num_files = len(filepaths)
            if num_files > 1:
                print("Message-ID: " + msg_id.decode())
                sha256sums = msg['sha256sums']
                same_content = self._are_all_equal(sha256sums)
                for cnt in range(0, num_files):
                    oper = ' '
                    if same_content and cnt > 0:
                        oper = 'D'
                    print(oper + ' ' + sha256sums[cnt] + ' ' + filepaths[cnt])
                print()
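
    # Counts duplicates and, when perform_delete is True, deletes them.
    # The first file of each message is kept; further copies are removed only
    # if all checksums match.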
    def count_delete_duplicates(self, perform_delete=False):
        num_messages = 0
        num_duplicates = 0
        for msg in self._messages.values():
            filepaths = msg['filepaths']
            num_files = len(filepaths)
            if num_files > 1:
                if self._are_all_equal(msg['sha256sums']):
                    num_messages += 1
                    for cnt in range(1, num_files):
                        num_duplicates += 1
                        if perform_delete:
                            os.remove(filepaths[cnt])
        return (num_messages, num_duplicates)
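

# Lists the found duplicates and asks for confirmation before anything is deleted.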
def main(maildir):
    maildir_cleaner = MaildirCleaner()
    maildir_cleaner.read_maildir(maildir)
    maildir_cleaner.print_duplicates()
    (num_msgs, num_duplicates) = maildir_cleaner.count_delete_duplicates(False)
    if num_msgs > 0:
        ask = "Do you want to delete {} duplicates of {} messages? (yes/no): "
        user_input = input(ask.format(num_duplicates, num_msgs))
        if user_input.lower() == 'yes':
            maildir_cleaner.count_delete_duplicates(True)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Delete duplicate messages from a maildir.")
    parser.add_argument("maildir", type=str,
                        help="path to the maildir (must contain 'cur' and 'new')")
    args = parser.parse_args()
    main(args.maildir)