资源简介
该数据集由一系列邮件组成,适用于测试垃圾邮件过滤系统,请勿用作商业目的。
代码片段和文件信息
#!/usr/bin/python
# FileName: Subsampling.py
# Version 1.0 by Tao Ban 2010.5.26
# This script extracts the content (i.e. the subject and the first payload part)
# from each .eml file and stores it in a new file with the same name in the dst dir.
import email.parser
import os
import shutil
import stat
import sys
def ExtractSubPayload (filename):
    """Extract the subject and payload from the .eml file.

    The message is parsed with the ``email`` package. For a multipart
    message only the first part is used; that part is converted with
    ``str()``, so its own headers are included in the returned text.

    Args:
        filename: path of the .eml file to read.

    Returns:
        The subject immediately followed by the payload text (no
        separator). When the message has no Subject header the subject
        is the literal text ``'None'`` (``str(None)``).

    Exits:
        Prints an error and calls ``sys.exit(1)`` if ``filename`` does
        not exist.
    """
    if not os.path.exists(filename):  # input file does not exist
        print("ERROR: input file does not exist: %s" % filename)
        # The original called os.exit(), which does not exist.
        sys.exit(1)

    # Close the handle as soon as parsing is done.
    with open(filename) as fp:
        msg = email.message_from_file(fp)

    payload = msg.get_payload()
    if isinstance(payload, list):
        # Only use the first part of the payload; tolerate an empty list.
        payload = payload[0] if payload else ''

    sub = str(msg.get('subject'))
    if not isinstance(payload, str):
        payload = str(payload)
    return sub + payload
def ExtractBodyFromDir ( srcdir, dstdir ):
    """Extract the body information from all files in srcdir and save
    each result to dstdir under the same name.

    Sub-directories of ``srcdir`` are processed recursively into the
    matching sub-directory of ``dstdir``. Every non-directory entry is
    passed to ``ExtractSubPayload`` (no filtering on the file extension
    is done here).

    Args:
        srcdir: directory holding the input files.
        dstdir: directory to write the extracted text to; created if it
            does not exist.
    """
    if not os.path.exists(dstdir):  # dest path does not exist
        os.makedirs(dstdir)

    for name in os.listdir(srcdir):
        srcpath = os.path.join(srcdir, name)
        dstpath = os.path.join(dstdir, name)
        if os.path.isdir(srcpath):  # for subfolders, recurse
            ExtractBodyFromDir(srcpath, dstpath)
        else:  # write the extracted text
            body = ExtractSubPayload(srcpath)
            # 'with' closes the output file even if write() raises.
            with open(dstpath, 'w') as dstfile:
                dstfile.write(body)
###################################################################
# Script entry point: ask for the source and destination directories
# on standard input, then run the extraction. Guarded so that importing
# this module does not prompt for input.
if __name__ == "__main__":
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3: input() behaves like raw_input()

    # srcdir is the directory where the .eml files are stored
    print('Input source directory: ')  # ask for source dir
    srcdir = read_line()
    if not os.path.exists(srcdir):
        print('The source directory %s does not exist exit...' % (srcdir))
        # Non-zero status so callers can detect the failure.
        sys.exit(1)

    # dstdir is the directory where the extracted content is stored
    print('Input destination directory: ')  # ask for dest dir
    dstdir = read_line()
    if not os.path.exists(dstdir):
        print('The destination directory is newly created.')
        os.makedirs(dstdir)

    ###################################################################
    ExtractBodyFromDir(srcdir, dstdir)
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2010-05-28 16:46 CSDMC2010_SPAM\
目录 0 2010-05-28 16:46 CSDMC2010_SPAM\CSDMC2010_SPAM\
文件 2177 2010-05-27 09:28 CSDMC2010_SPAM\CSDMC2010_SPAM\ExtractContent.py
文件 3411 2010-05-27 09:29 CSDMC2010_SPAM\CSDMC2010_SPAM\readme.txt
文件 77886 2010-05-27 06:27 CSDMC2010_SPAM\CSDMC2010_SPAM\SPAMTrain.label
目录 0 2010-05-28 16:47 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\
文件 6215 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00000.eml
文件 6484 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00001.eml
文件 7705 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00002.eml
文件 6260 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00003.eml
文件 33094 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00004.eml
文件 49320 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00005.eml
文件 3163 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00006.eml
文件 2519 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00007.eml
文件 30295 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00008.eml
文件 2514 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00009.eml
文件 13698 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00010.eml
文件 5639 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00011.eml
文件 1098 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00012.eml
文件 5555 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00013.eml
文件 6049 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00014.eml
文件 4667 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00015.eml
文件 3945 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00016.eml
文件 7610 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00017.eml
文件 3487 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00018.eml
文件 5110 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00019.eml
文件 5037 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00020.eml
文件 6634 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00021.eml
文件 6406 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00022.eml
文件 2297 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00023.eml
文件 3867 2010-05-27 06:01 CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00024.eml
............此处省略8595个文件信息
相关资源
- IP Remote control software RS-BA1 ICOM 短波电
- 垃圾邮件语料库
- 垃圾邮件分类数据集中英文均有
- 测试垃圾邮件过滤的邮件集
- 开关电源设计(第3版 中英文版 Albr
- 算法实验凸包枚举、Graham-Scan、分治三
- 马的Hamilton周游路线问题国际象棋
- shamir 秘密共享算法
- Chameleon聚类算法的Weka实现
- coursera斯坦福机器学习公开课支持向量
- 基于贝叶斯算法的垃圾邮件过滤技术
- 垃圾邮件数据集
- wowpc.iso.Chameleon_2.4svn_r2884_Enoch_10.13.z
- Tensorflow垃圾邮件分类
- 计算机图形学__Bresenham完整算法_画直
- 垃圾邮件过滤系统的详细设计过程及
- 基于朴素贝叶斯方法的垃圾邮件分类
- 垃圾邮件分类实验数据
- GPSR_KeLiu_SUNY_Binghamton.tgz
- shamirmn门限共享方案
- 简单贝叶斯实现垃圾邮件分类
- 一种基于多贝叶斯算法的垃圾邮件过
- 实现canvas 图片拖拽旋转移动 点击转
- wowpc.iso.Chameleon_2.2svn_r2378_trunk_10.10
- hamcrest1.3完整版
- 基于朴素贝叶斯的垃圾邮件分类
- 17 机器学习案例——基于朴素贝叶斯
- 用Bresenham算法画任意斜率的线
- 朴素贝叶斯算法的邮件数据
- 机器学习朴素贝叶斯垃圾邮件过滤器
评论
共有 条评论