Python 大文件去重

射满东城湖 / 2024-08-09 / 原文

#!/usr/bin/env python
# -*- coding: utf-8 -*-

def filters(input_file, output_file):
    """Append the unique, non-``.cn`` lines of *input_file* to *output_file*.

    Each input line is stripped of surrounding whitespace. The first
    occurrence of every distinct entry is kept, later repeats are dropped,
    and entries ending in ``".cn"`` are never written. Blank lines are
    ignored. Output is appended, so content already present in
    *output_file* is left in place and is not consulted for duplicates.

    Every distinct entry is held in an in-memory set for the duration of
    the call, so memory use grows with the number of distinct lines.

    Args:
        input_file: Path of the UTF-8 text file to read, one entry per line.
        output_file: Path of the UTF-8 text file to append results to; it is
            created if it does not exist.

    Returns:
        None. I/O and decoding errors are printed rather than raised, and
        ``"gg!"`` is printed when the call finishes, whether or not it
        succeeded.
    """
    seen_domains = set()
    try:
        # Open the input first so that a missing or unreadable input does not
        # leave behind a freshly created, empty output file.
        with open(input_file, 'r', encoding="utf-8") as source, \
                open(output_file, mode='a', encoding="utf-8") as sink:
            for line in source:
                domain = line.strip()
                # A blank line carries no entry; without this guard the first
                # one would be emitted as an empty output line.
                if not domain or domain in seen_domains:
                    continue
                # Record the entry even when it is filtered out below, so a
                # repeated ".cn" entry is still recognised as already seen.
                seen_domains.add(domain)
                if domain.endswith(".cn"):
                    continue
                sink.write(domain + "\n")
                # NOTE(review): the original indentation was lost, so the
                # exact nesting of this progress marker is assumed to be
                # "once per written line" - confirm.
                print(1)
    except (OSError, UnicodeError) as e:
        # Best-effort script behaviour: report file-system and encoding
        # problems and carry on, but let programming errors surface.
        print(e)
    finally:
        print("gg!")

# Source list and destination for the filtered result; both paths are
# relative to the current working directory.
input_file, output_file = '去重前.txt', '去重后.txt'

# Run the de-duplication as soon as the script is executed.
filters(input_file, output_file)