Skip to content

Commit

Permalink
check paired-end file
Browse files Browse the repository at this point in the history
  • Loading branch information
troublezhang committed May 31, 2016
1 parent d2f2b14 commit d5b54d2
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 3 deletions.
6 changes: 6 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
2016-05-31 Yanxiao Zhang <[email protected]>
* version 1.1.8
* added support for paired-end reads (bam and sam)
* version 1.1.9
* check if the paired-end files are sorted by the read name.

2016-05-26 Yanxiao Zhang <[email protected]>
* version 1.1.7
* added back the inter-group normalization
Expand Down
2 changes: 1 addition & 1 deletion PePr.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: PePr
Version: 1.1.8
Version: 1.1.9
Summary: Peak-calling and Prioritization pipeline for replicated ChIP-Seq data
Home-page: https://github.com/shawnzhangyx/PePr/
Author: Yanxiao Zhang
Expand Down
2 changes: 1 addition & 1 deletion PePr/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.1.8'
__version__ = '1.1.9'
61 changes: 60 additions & 1 deletion PePr/classDef.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
import logConfig
from logging import info, debug
from logging import info, debug, warning
import collections
import pysam

## global variable read_dict
read_dict = {}
Expand Down Expand Up @@ -53,6 +55,7 @@ def __init__(self, opt):
self.validate_parameters()
self.validate_files()
logConfig.startLog(self.output_directory + self.name)
self.check_file_formats()

# --- initialize logging --- #

Expand Down Expand Up @@ -210,6 +213,62 @@ def validate_files(self):
if os.path.isfile(self.input_directory+filename) is False:
print "File:",self.input_directory+filename, " not found"
exit(1)

@staticmethod
def check_sampe_sorted(filename, input_dir):
with open(input_dir + filename,'r') as infile:
for line in infile:
if not line.startswith("@"):
break
count_list = []
count = 1
pre_name = ''
for idx,line in enumerate(infile):
name = line.strip().split()[0]
if name == pre_name:
count += 1
else:
count_list.append(count)
count = 1
pre_name = name
if idx == 999:
break

count1 = len([i for i in count_list if i==1])
ratio = float(count1)/sum(count_list)
#print filename, ratio
if ratio > 0.8:
warning("%s may not be sorted by read name. Please check.",filename)

@staticmethod
def check_bampe_sorted(filename, input_dir):
    """Warn if a paired-end BAM file does not look sorted by read name.

    Reads up to the first 1000 alignments through ``pysam`` and groups
    consecutive alignments sharing the same ``query_name``. When more than
    80% of the sampled alignments have a name that is not repeated on a
    neighbouring record, a warning is logged through ``logging.warning``.

    Args:
        filename: name of the BAM file, relative to ``input_dir``.
        input_dir: directory prefix; it is concatenated with ``filename``
            as-is, so it must already end with a path separator.

    Returns:
        None. The only effect is the optional warning.
    """
    # Function-scope import keeps this method self-contained.
    import itertools

    max_records = 1000
    singleton_threshold = 0.8

    infile = pysam.Samfile(input_dir + filename, 'rb')
    try:
        # until_eof=True iterates in file order without needing an index.
        sample = itertools.islice(
            infile.fetch(until_eof=True), max_records)
        names = [read.query_name for read in sample]
    finally:
        # Always release the file handle, even if reading fails.
        infile.close()

    # A file with no alignments gives nothing to judge (and would
    # otherwise lead to a division by zero).
    if not names:
        return

    # Length of every run of identical consecutive names, including the
    # last run in the sample.
    run_lengths = [sum(1 for _ in run)
                   for _, run in itertools.groupby(names)]
    ratio = float(run_lengths.count(1)) / len(names)
    if ratio > singleton_threshold:
        warning("%s may not be sorted by read name. Please check.", filename)

def check_file_formats(self):
    """Run the read-name sort check on each input file of a paired-end run.

    Nothing is done unless ``self.file_format`` is 'sampe' or 'bampe'.
    Otherwise the checker for that format is called once per name returned
    by ``self.get_filenames()``, with ``self.input_directory`` as the
    directory prefix.
    """
    sort_checkers = {
        'sampe': self.check_sampe_sorted,
        'bampe': self.check_bampe_sorted,
    }
    # Guard clause: only the paired-end formats have a sort check.
    if self.file_format not in ('sampe', 'bampe'):
        return
    for fname in self.get_filenames():
        run_check = sort_checkers[self.file_format]
        run_check(fname, self.input_directory)


def write_parameter_to_file(self):
'''write the current parameters to the files so user can repeat the analysis'''
# check if the file name has already been taken.
Expand Down
Binary file added PePr/pre_processing/.fileParser.py.swp
Binary file not shown.
Binary file added dist/PePr-1.1.9.tar.gz
Binary file not shown.

0 comments on commit d5b54d2

Please sign in to comment.