#!/usr/bin/env python3

# delete_duplicates_maildir.py
# 2023-04-29
# by Gernot Walzl

# If `fetchmail --all` is run twice, duplicate messages will be in the mailbox.
# This script cleans the maildir from multiple messages with the same content.

import argparse
import hashlib
import os

class MaildirCleaner:
    """Indexes a maildir by Message-ID and removes duplicate messages.

    Two files count as duplicates when they share the same Message-ID
    header and the same SHA-256 checksum of their content (excluding
    the first four lines, which fetchmail rewrites on every fetch).
    """

    def __init__(self):
        # Maps Message-ID (bytes) -> {'filepaths': [str, ...],
        #                             'sha256sums': [str, ...]}
        # 'sha256sums' is only filled in for IDs with more than one file.
        self._messages = {}

    @staticmethod
    def _read_message_id(filepath_message):
        """Return the Message-ID header value (bytes) of the given
        message file, or None if no Message-ID header is found."""
        message_id = None
        with open(filepath_message, 'rb') as file_msg:
            msg_id_next_line = False
            for line in file_msg:
                if msg_id_next_line:
                    # The header value was folded onto the next line.
                    message_id = line.strip()
                elif line.startswith(b'Message-ID:'):
                    message_id = line[11:].strip()
                    if not message_id:
                        msg_id_next_line = True
                if message_id:
                    break  # stop reading once the ID is known
        return message_id

    def _read_message_ids(self, path):
        """Index every message file in `path` by its Message-ID."""
        for filename in sorted(os.listdir(path)):
            filepath = os.path.join(path, filename)
            msg_id = self._read_message_id(filepath)
            if msg_id:
                if msg_id not in self._messages:
                    self._messages[msg_id] = {'filepaths': []}
                self._messages[msg_id]['filepaths'].append(filepath)

    @staticmethod
    def _compute_checksum_content(filepath_message):
        """Return the SHA-256 hex digest of the message file's content."""
        sha256 = hashlib.sha256()
        with open(filepath_message, 'rb') as file_msg:
            lines = file_msg.readlines()
            for line in lines[4:]:  # timestamp of fetchmail is skipped
                sha256.update(line)
        return sha256.hexdigest()

    def _compute_checksums_duplicates(self):
        """Compute content checksums for every Message-ID that is
        referenced by more than one file."""
        for msg_id, msg in self._messages.items():
            filepaths = msg['filepaths']
            if len(filepaths) > 1:
                digests = []
                for filepath in filepaths:
                    digests.append(self._compute_checksum_content(filepath))
                self._messages[msg_id]['sha256sums'] = digests

    def read_maildir(self, maildir):
        """Scan the 'cur' and 'new' subdirectories of `maildir` and
        prepare the checksum data needed for duplicate detection."""
        self._read_message_ids(os.path.join(maildir, "cur"))
        self._read_message_ids(os.path.join(maildir, "new"))
        self._compute_checksums_duplicates()

    @staticmethod
    def _are_all_equal(elements):
        """Return True if all elements are equal (True for empty input)."""
        return all(element == elements[0] for element in elements)

    def print_duplicates(self):
        """Print each duplicated Message-ID with its files; lines
        marked 'D' are the copies that would be deleted."""
        for msg_id, msg in self._messages.items():
            filepaths = msg['filepaths']
            num_files = len(filepaths)
            if num_files > 1:
                print("Message-ID: "+msg_id.decode())
                sha256sums = msg['sha256sums']
                same_content = self._are_all_equal(sha256sums)
                for cnt in range(0, num_files):
                    oper = ' '
                    # The first file is kept; later identical copies
                    # are marked for deletion.
                    if same_content and cnt > 0:
                        oper = 'D'
                    print(oper+'   '+sha256sums[cnt]+'  '+filepaths[cnt])

    def count_delete_duplicates(self, perform_delete=False):
        """Count (and optionally delete) duplicate message files.

        Returns a tuple (num_messages, num_duplicates): the number of
        messages that exist in identical copies, and the number of
        extra copies. When `perform_delete` is True, every copy but
        the first is removed from disk.
        """
        num_messages = 0
        num_duplicates = 0
        for msg in self._messages.values():
            filepaths = msg['filepaths']
            num_files = len(filepaths)
            if num_files > 1:
                # Only touch messages whose copies are byte-identical.
                if self._are_all_equal(msg['sha256sums']):
                    num_messages += 1
                    for cnt in range(1, num_files):
                        num_duplicates += 1
                        if perform_delete:
                            os.remove(filepaths[cnt])
        return (num_messages, num_duplicates)

def main(maildir):
    """Scan `maildir`, report duplicate messages, and delete them
    after interactive confirmation."""
    maildir_cleaner = MaildirCleaner()
    # Scan 'cur' and 'new' and compute checksums of duplicate candidates.
    maildir_cleaner.read_maildir(maildir)
    # Show the user what would be deleted before asking.
    maildir_cleaner.print_duplicates()
    (num_msgs, num_duplicates) = maildir_cleaner.count_delete_duplicates(False)
    if num_msgs > 0:
        ask = "Do you want to delete {} duplicates of {} messages? (yes/no): "
        user_input = input(ask.format(num_duplicates, num_msgs))
        if user_input.lower() == 'yes':
            # Second pass with perform_delete=True removes the copies.
            maildir_cleaner.count_delete_duplicates(True)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Delete duplicate messages from a maildir.")
    parser.add_argument("maildir", type=str)
    args = parser.parse_args()
    main(args.maildir)