#!/usr/bin/env python3
"""
Script ottimizzato per cercare stringhe in file ZIP contenenti log XML di Windows
Versione specializzata per dump MySQL con log di eventi Windows
"""

import zipfile
import os
import sys
import argparse
from pathlib import Path
import concurrent.futures
import threading
from datetime import datetime
import gc
import signal
import time
import csv
import xml.etree.ElementTree as ET
import re
from collections import defaultdict

# Lock serializing console output (and the shared progress counters) across worker threads
print_lock = threading.Lock()
# Global progress counters, mutated under print_lock by worker threads
processed_count = 0
total_files = 0
# Per-file match records accumulated for the final CSV report, guarded by csv_lock
csv_results = []
csv_lock = threading.Lock()

def signal_handler(sig, frame):
    """SIGINT handler: announce the interruption and terminate with exit code 0."""
    print('\n\nInterruzione richiesta. Uscita...')
    raise SystemExit(0)

# Install the handler so Ctrl+C produces a clean shutdown message
signal.signal(signal.SIGINT, signal_handler)

def update_progress():
    """Render the current progress counter in place on the console line."""
    with print_lock:
        if total_files > 0:
            percent = processed_count / total_files * 100
        else:
            percent = 0
        print(f"\rProgresso: {processed_count}/{total_files} ({percent:.1f}%)", end='', flush=True)

# Pre-compiled patterns for the fields mined out of Windows event XML.
_EVENT_ID_RE = re.compile(r'<EventID>(\d+)</EventID>')
_TIMESTAMP_RE = re.compile(r'<TimeCreated SystemTime="([^"]+)"')
_LEVEL_RE = re.compile(r'<Level>(\d+)</Level>')
_PROVIDER_RE = re.compile(r'<Provider Name="([^"]+)"')

def extract_windows_log_info(content):
    """Extract Windows event-log fields from raw XML bytes via regex scanning.

    Regexes are used instead of an XML parser on purpose: exported event
    logs often carry namespaces and may be truncated or concatenated, which
    would make strict parsing fail.

    Parameters:
        content: raw bytes of a (possibly malformed) Windows event XML file.

    Returns:
        dict with keys 'event_ids', 'timestamps', 'levels', 'sources'.
        'timestamps' holds at most [first, last]; the other lists are
        de-duplicated in first-seen order (deterministic, unlike the
        previous list(set(...)) which randomized the CSV output).
        Best-effort: returns whatever was gathered before any error.
    """
    log_info = {
        'event_ids': [],
        'timestamps': [],
        'levels': [],
        'sources': []
    }

    try:
        content_str = content.decode('utf-8', errors='ignore')

        # Strip a UTF-8 BOM if present
        if content_str.startswith('\ufeff'):
            content_str = content_str[1:]

        # Event IDs, unique, first-seen order
        log_info['event_ids'] = list(dict.fromkeys(_EVENT_ID_RE.findall(content_str)))

        # First and last timestamp delimit the covered time range
        timestamps = _TIMESTAMP_RE.findall(content_str)
        if timestamps:
            log_info['timestamps'] = [timestamps[0], timestamps[-1]]

        # Numeric <Level> values as found in the XML, unique, first-seen order
        log_info['levels'] = list(dict.fromkeys(_LEVEL_RE.findall(content_str)))

        # Event providers/sources, unique, first-seen order
        log_info['sources'] = list(dict.fromkeys(_PROVIDER_RE.findall(content_str)))

    except Exception:
        # Best-effort: keep whatever was extracted before the failure
        pass

    return log_info

def search_in_file_content(file_handle, search_string, case_sensitive=True, max_size_mb=50, is_xml=False):
    """Count occurrences of search_string in a file-like object's content.

    Parameters:
        file_handle: seekable binary file-like object (e.g. a ZipExtFile).
        search_string: substring to count.
        case_sensitive: when False, match case-insensitively.
        max_size_mb: files larger than this many MB are skipped entirely.
        is_xml: when True, also extract Windows event-log metadata and up
            to 3 lines of match context into the returned info dict.

    Returns:
        (occurrences, log_info) tuple; (0, {}) for oversized files or on
        any read/decode error (best-effort by design).
    """
    try:
        # Measure size by seeking to the end; for compressed ZIP members
        # this forces decompression, but keeps the interface file-like.
        file_handle.seek(0, 2)
        size = file_handle.tell()
        file_handle.seek(0)

        # Skip oversized files to bound memory usage
        if size > max_size_mb * 1024 * 1024:
            return 0, {}

        content = file_handle.read()

        # Extra Windows-event-log metadata for XML files
        log_info = extract_windows_log_info(content) if is_xml else {}

        try:
            text_content = content.decode('utf-8', errors='ignore')
            if not case_sensitive:
                text_content = text_content.lower()
                search_str = search_string.lower()
            else:
                search_str = search_string

            occurrences = text_content.count(search_str)

            # For XML hits, capture up to 3 matching lines as context
            if occurrences > 0 and is_xml:
                context_lines = []
                for i, line in enumerate(text_content.split('\n')):
                    if search_str in line:
                        context_lines.append(f"Riga {i+1}: {line.strip()[:100]}...")
                        if len(context_lines) >= 3:  # limit to 3 samples
                            break
                log_info['context'] = context_lines

            return occurrences, log_info

        except Exception:
            # Was a bare `except:` (it also swallowed SystemExit/KeyboardInterrupt)
            return 0, {}

    except Exception:
        return 0, {}

def search_in_zip_detailed(zip_path, search_string, case_sensitive=True, search_in_filenames=False, max_file_size_mb=50):
    """Search every member of a ZIP archive and report matches per file.

    Parameters:
        zip_path: path to the ZIP archive.
        search_string: substring to count.
        case_sensitive: when False, match case-insensitively.
        search_in_filenames: when True, also count matches in member names.
        max_file_size_mb: members larger than this are not content-searched.

    Returns:
        list of result dicts ('zip_file', 'internal_file', 'occurrences',
        'file_size', 'is_xml', plus optional Windows-log fields for XML
        members). Empty list for unreadable/corrupt archives.
    """
    detailed_results = []

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_file:
            for file_name in zip_file.namelist():
                try:
                    file_info = zip_file.getinfo(file_name)

                    # Skip directory entries
                    if file_name.endswith('/'):
                        continue

                    is_xml = file_name.lower().endswith('.xml')

                    file_occurrences = 0
                    log_info = {}

                    # Count matches in the member name when requested
                    if search_in_filenames:
                        search_name = file_name if case_sensitive else file_name.lower()
                        search_str = search_string if case_sensitive else search_string.lower()
                        file_occurrences += search_name.count(search_str)

                    # Content search is skipped for oversized members and for
                    # nested ZIPs — but filename hits counted above are still
                    # reported (the previous version silently dropped them by
                    # `continue`-ing past the result-recording step).
                    skip_content = (
                        file_info.file_size > max_file_size_mb * 1024 * 1024
                        or file_name.lower().endswith('.zip')
                    )

                    if not skip_content:
                        with zip_file.open(file_name) as file_handle:
                            content_occurrences, log_info = search_in_file_content(
                                file_handle, search_string, case_sensitive, max_file_size_mb, is_xml
                            )
                            file_occurrences += content_occurrences

                    if file_occurrences > 0:
                        result = {
                            'zip_file': str(zip_path),
                            'internal_file': file_name,
                            'occurrences': file_occurrences,
                            'file_size': file_info.file_size,
                            'is_xml': is_xml
                        }

                        # Attach Windows-event-log metadata when available
                        if log_info:
                            if log_info.get('event_ids'):
                                result['event_ids'] = ','.join(log_info['event_ids'][:5])  # first 5
                            if log_info.get('sources'):
                                result['sources'] = ','.join(log_info['sources'][:3])  # first 3
                            if log_info.get('timestamps'):
                                result['time_range'] = f"{log_info['timestamps'][0]} - {log_info['timestamps'][1]}"
                            if log_info.get('context'):
                                result['context'] = ' | '.join(log_info['context'][:2])

                        detailed_results.append(result)

                except Exception:
                    # Ignore unreadable/corrupt members
                    pass

    except zipfile.BadZipFile:
        pass
    except Exception:
        pass

    return detailed_results

def process_zip_file(zip_path, search_string, case_sensitive, search_in_filenames, max_file_size_mb):
    """Process a single ZIP: search it, buffer results, update progress.

    Parameters mirror search_in_zip_detailed.

    Returns:
        True when the archive contained at least one match, else False.
    """
    global processed_count

    try:
        results = search_in_zip_detailed(
            zip_path,
            search_string,
            case_sensitive,
            search_in_filenames,
            max_file_size_mb
        )

        if results:
            # Buffer per-file results for the final CSV report
            with csv_lock:
                csv_results.extend(results)

        # Increment the shared counter and snapshot it while still holding
        # the lock: the old code re-read processed_count outside the lock,
        # so concurrent workers could skip or double the periodic actions.
        with print_lock:
            processed_count += 1
            current = processed_count

        # Refresh the progress line every 10 files...
        if current % 10 == 0:
            update_progress()
            # ...and force a GC every 100 files to cap memory growth
            if current % 100 == 0:
                gc.collect()

        return len(results) > 0

    except Exception:
        # Still count failed files as processed so progress reaches 100%
        with print_lock:
            processed_count += 1
        return False

def find_zip_files_generator(directory):
    """Lazily yield the path of every .zip file under *directory* (recursive)."""
    for dirpath, _subdirs, filenames in os.walk(directory):
        yield from (
            Path(dirpath) / name
            for name in filenames
            if name.lower().endswith('.zip')
        )

def _dispatch_batch(executor, batch, args):
    """Submit one batch of ZIP paths to *executor*; return how many had matches."""
    futures = [
        executor.submit(
            process_zip_file,
            zf,
            args.search_string,
            not args.ignore_case,
            args.filenames,
            args.max_file_size
        )
        for zf in batch
    ]
    return sum(1 for future in concurrent.futures.as_completed(futures) if future.result())

def process_in_batches(zip_files, batch_size, args):
    """Process ZIP files in fixed-size batches to limit memory pressure.

    The submit/collect logic was previously duplicated verbatim for the
    final partial batch; it now lives once in _dispatch_batch.

    Parameters:
        zip_files: iterable of ZIP file paths.
        batch_size: number of files dispatched per batch.
        args: parsed CLI namespace (threads, search_string, ignore_case,
            filenames, max_file_size).

    Returns:
        number of ZIP files that contained at least one match.
    """
    batch = []
    found_count = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
        for zip_file in zip_files:
            batch.append(zip_file)

            if len(batch) >= batch_size:
                found_count += _dispatch_batch(executor, batch, args)
                batch.clear()
                # Reclaim memory between batches and briefly yield the CPU
                gc.collect()
                time.sleep(0.1)

        # Flush the final, possibly partial, batch
        if batch:
            found_count += _dispatch_batch(executor, batch, args)

    return found_count

def main():
    """CLI entry point: parse arguments, scan the ZIPs, write the CSV report.

    Side effects: mutates the module-level progress globals, prints progress
    and statistics to stdout, and writes a timestamped CSV in the CWD.
    """
    parser = argparse.ArgumentParser(
        description='Cerca stringhe in file ZIP contenenti log XML di Windows',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Esempi:
  python %(prog)s "error" /path/to/logs/
  python %(prog)s -i "exception" /path/to/logs/ --extended
  python %(prog)s "EventID>4625" /path/to/logs/ --extended
        """
    )
    parser.add_argument('search_string', help='Stringa da cercare')
    parser.add_argument('path', help='Percorso del file ZIP o directory da analizzare')
    parser.add_argument('-i', '--ignore-case', action='store_true',
                       help='Ricerca case-insensitive')
    parser.add_argument('-n', '--filenames', action='store_true',
                       help='Cerca anche nei nomi dei file')
    parser.add_argument('-t', '--threads', type=int, default=2,
                       help='Numero di thread (default: 2, max: 4)')
    parser.add_argument('-b', '--batch-size', type=int, default=50,
                       help='Dimensione batch (default: 50)')
    parser.add_argument('-m', '--max-file-size', type=int, default=50,
                       help='Max dimensione file in MB (default: 50)')
    parser.add_argument('--extended', action='store_true',
                       help='Output esteso con info sui log Windows')

    args = parser.parse_args()

    # Cap the worker-thread count at 4 regardless of the CLI value
    args.threads = min(args.threads, 4)

    path = Path(args.path)

    if not path.exists():
        print(f"Errore: Il percorso '{path}' non esiste")
        sys.exit(1)

    # Timestamped CSV output filename, created in the current directory
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"ricerca_log_{timestamp}.csv"

    print(f"Ricerca di '{args.search_string}' nei log Windows...")
    print(f"Case-sensitive: {'No' if args.ignore_case else 'Sì'}")
    print(f"Cerca nei nomi file: {'Sì' if args.filenames else 'No'}")
    print(f"Output esteso: {'Sì' if args.extended else 'No'}")
    print(f"Thread: {args.threads}")
    print(f"File output: {output_filename}")
    print("-" * 70)

    start_time = datetime.now()

    # Reset shared state so repeated invocations start clean
    global total_files, processed_count, csv_results
    processed_count = 0
    csv_results = []

    # Collect the ZIP list up front so progress percentages can be computed
    if path.is_file() and path.suffix.lower() == '.zip':
        zip_files_list = [path]
        total_files = 1
    else:
        print(f"Scansione directory: {path}")
        zip_files_list = list(find_zip_files_generator(path))
        total_files = len(zip_files_list)
        print(f"Trovati {total_files} file ZIP")

    if not zip_files_list:
        print("Nessun file ZIP trovato")
        sys.exit(0)

    # Run the batched, multithreaded search
    print("\nElaborazione in corso...")
    found_count = process_in_batches(zip_files_list, args.batch_size, args)

    # Write the CSV report (only when something was found)
    if csv_results:
        # The column set depends on whether extended output was requested
        if args.extended:
            fieldnames = [
                'Stringa ricercata', 'File ZIP', 'File interno',
                'Occorrenze', 'Dimensione (KB)', 'Event IDs',
                'Sources', 'Time Range', 'Contesto'
            ]
        else:
            fieldnames = [
                'Stringa ricercata', 'File ZIP', 'File interno',
                'Occorrenze', 'Dimensione (KB)'
            ]

        with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            # Sort by occurrence count, then internal file name, descending.
            # NOTE: sorted_results is reused below in the "top 10" section,
            # which is safe only because both sections are guarded by the
            # same `if csv_results:` condition.
            sorted_results = sorted(csv_results, key=lambda x: (x['occurrences'], x['internal_file']), reverse=True)

            for result in sorted_results:
                row = {
                    'Stringa ricercata': args.search_string,
                    'File ZIP': os.path.basename(result['zip_file']),
                    'File interno': result['internal_file'],
                    'Occorrenze': result['occurrences'],
                    'Dimensione (KB)': round(result['file_size'] / 1024, 2)
                }

                if args.extended:
                    row.update({
                        'Event IDs': result.get('event_ids', ''),
                        'Sources': result.get('sources', ''),
                        'Time Range': result.get('time_range', ''),
                        'Contesto': result.get('context', '')[:100] + '...' if result.get('context', '') else ''
                    })

                writer.writerow(row)

    # Final statistics
    end_time = datetime.now()
    duration = end_time - start_time

    print("\n\n" + "=" * 70)
    print(f"Ricerca completata in {duration.total_seconds():.2f} secondi")
    print(f"File ZIP analizzati: {total_files}")
    print(f"File con corrispondenze: {found_count}")

    if csv_results:
        total_occurrences = sum(r['occurrences'] for r in csv_results)
        unique_files = len(set(r['internal_file'] for r in csv_results))

        print(f"\nTotale occorrenze trovate: {total_occurrences}")
        print(f"File unici con corrispondenze: {unique_files}")
        print(f"\nRisultati salvati in: {output_filename}")

        # Show the top-10 files with the most occurrences
        print(f"\nTop 10 file con più occorrenze:")
        for i, result in enumerate(sorted_results[:10], 1):
            print(f"  {i}. {result['internal_file']} ({result['occurrences']} occorrenze)")

        # With --extended, also show aggregate Windows-log statistics
        if args.extended:
            all_event_ids = []
            all_sources = []
            for r in csv_results:
                if r.get('event_ids'):
                    all_event_ids.extend(r['event_ids'].split(','))
                if r.get('sources'):
                    all_sources.extend(r['sources'].split(','))

            if all_event_ids:
                print(f"\nEvent ID più comuni:")
                event_count = defaultdict(int)
                for eid in all_event_ids:
                    event_count[eid.strip()] += 1
                for eid, count in sorted(event_count.items(), key=lambda x: x[1], reverse=True)[:5]:
                    print(f"  - {eid}: {count} volte")

            if all_sources:
                print(f"\nSources più comuni:")
                source_count = defaultdict(int)
                for src in all_sources:
                    source_count[src.strip()] += 1
                for src, count in sorted(source_count.items(), key=lambda x: x[1], reverse=True)[:5]:
                    print(f"  - {src}: {count} volte")

if __name__ == "__main__":
    main()
