Commit 4b7ea744 authored by Jeff Thiessen


Merge branch 'development' of gitlab-irida.corefacility.ca:irida/irida-miseq-uploader into api_singleton
parents 5b0fc406 cecaaeb4
Pipeline #7240 failed with stage in 5 minutes and 50 seconds
@@ -733,13 +733,13 @@ class ApiCalls(object):
             logging.info("Finished uploading sequence files for sample [{}]".format(sample.get_id()))
             send_message(sample.upload_completed_topic, sample=sample)
         else:
-            err_msg = ("Error {status_code}: {err_msg}\n").format(
+            e = SequenceFileError("Error {status_code}: {err_msg}\n".format(
                 status_code=str(response.status_code),
-                err_msg=response.reason)
+                err_msg=response.reason))
             logging.info("Got an error when uploading [{}]: [{}]".format(sample.get_id(), err_msg))
             logging.info(response.text)
-            send_message(sample.upload_failed_topic, exception = e)
-            raise SequenceFileError(err_msg, [])
+            send_message(sample.upload_failed_topic, exception=e)
+            raise e
         return json_res
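The pattern this hunk converges on: build the exception object once, publish it to any pubsub subscribers, and then raise that very object, so the GUI and the caller see identical error details. A minimal, runnable sketch of the idea; SequenceFileError and send_message here are stand-ins, not the project's actual implementations:

class SequenceFileError(Exception):
    """Stand-in for Exceptions.SequenceFileError."""

def send_message(topic, **kwargs):
    """Stand-in for API.pubsub.send_message."""
    print("publishing on {}: {}".format(topic, kwargs))

def fail_upload(status_code, reason, failed_topic):
    # One exception object serves both purposes: subscribers receive it via
    # pubsub, and the caller receives it via raise.
    e = SequenceFileError("Error {status_code}: {err_msg}\n".format(
        status_code=status_code, err_msg=reason))
    send_message(failed_topic, exception=e)
    raise e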
@@ -86,20 +86,27 @@ def search_for_upload(directory, cond):
     .miseqUploaderInfo files.
     """
     global toMonitor

-    for root, dirs, files in os.walk(directory, topdown=True):
-        for name in dirs:
-            checkForCompJob = os.path.join(root, name, "CompletedJobInfo.xml")
-            checkForMiSeq = os.path.join(root, name, ".miseqUploaderInfo")
-            if os.path.isfile(checkForCompJob):
-                if not os.path.isfile(checkForMiSeq):
-                    path_to_upload = checkForCompJob
-                    if toMonitor:
-                        on_created(path_to_upload, cond)
-                    # After upload, start back at the start of directories
-                    return
-            # Check each step of loop if monitoring is still required
-            if not toMonitor:
-                return
+    if not os.access(directory, os.W_OK):
+        logging.warning("Could not access directory while monitoring for samples, directory is not writeable {}".format(directory))
+        return
+
+    root = next(os.walk(directory))[0]
+    dirs = next(os.walk(directory))[1]
+    for name in dirs:
+        check_for_comp_job = os.path.join(root, name, "CompletedJobInfo.xml")
+        check_for_miseq = os.path.join(root, name, ".miseqUploaderInfo")
+        if os.path.isfile(check_for_comp_job):
+            if not os.path.isfile(check_for_miseq):
+                path_to_upload = check_for_comp_job
+                if toMonitor:
+                    on_created(path_to_upload, cond)
+                # After upload, start back at the start of directories
+                return
+        # Check each step of loop if monitoring is still required
+        if not toMonitor:
+            return
+    return

 def stop_monitoring():
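A side note on the next(os.walk(directory)) idiom adopted above: os.walk returns a generator, so taking only its first (root, dirs, files) triple lists the immediate children without recursing into every run's subtree, which is what the old for-loop over os.walk did. A small illustration with a hypothetical path, not project code:

import os

def immediate_subdirs(directory):
    # First item from the os.walk generator: the top-level listing only.
    # Assumes the directory exists; otherwise next() raises StopIteration.
    root, dirs, files = next(os.walk(directory))
    return [os.path.join(root, name) for name in dirs]

# immediate_subdirs("/path/to/runs")  # hypothetical usage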
import json
import logging
from os import path
import os
from Exceptions.SampleSheetError import SampleSheetError
from Exceptions.SequenceFileError import SequenceFileError
from Exceptions.SampleError import SampleError
from Validation.offlineValidation import validate_sample_sheet, validate_sample_list
from Parsers.miseqParser import parse_metadata, complete_parse_samples
from Model.SequencingRun import SequencingRun
from API.fileutils import find_file_by_name
from API.pubsub import send_message
class DirectoryScannerTopics(object):
    """Topics issued by `find_runs_in_directory`"""
    finished_run_scan = "finished_run_scan"
@@ -17,6 +17,7 @@ class DirectoryScannerTopics(object):
     garbled_sample_sheet = "garbled_sample_sheet"
     missing_files = "missing_files"
def find_runs_in_directory(directory):
"""Find and validate all runs the specified directory.
@@ -27,43 +28,100 @@ def find_runs_in_directory(directory):
     Arguments:
     directory -- the directory to find sequencing runs

+    Usage:
+    Can be used on the directory containing the SampleSheet.csv file (a single run)
+    Can be used on the directory containing directories with SampleSheet.csv files in them (a group of runs)
+
     Returns: a list of populated sequencing run objects found
     in the directory, ready to be uploaded.
     """
-    logging.info("looking for sample sheet in {}".format(directory))
-
-    sample_sheets = find_file_by_name(directory = directory,
-                                      name_pattern = '^SampleSheet.csv$',
-                                      depth = 2)
-    logging.info("found sample sheets: {}".format(", ".join(sample_sheets)))
-
-    # filter directories that have been completely uploaded
-    sheets_to_upload = filter(lambda dir: not run_is_uploaded(path.dirname(dir)), sample_sheets)
-    logging.info("filtered sample sheets: {}".format(", ".join(sheets_to_upload)))
-
-    sequencing_runs = [process_sample_sheet(sheet) for sheet in sheets_to_upload]
-
-    send_message(DirectoryScannerTopics.finished_run_scan)
-
-    return sequencing_runs
-
-def run_is_uploaded(run_directory):
-    """Check if a run has already been uploaded.
-
-    This function checks for the existence of a file `.miseqUploaderInfo`, then
-    evaluates the status of the run by looking at the "Upload Status" field.
-
-    Arguments:
-    run_directory -- the sequencing run directory
-
-    Returns: true if the run has already been uploaded, false if it has not.
-    """
-    uploader_info_file = find_file_by_name(run_directory, '.miseqUploaderInfo', depth = 1)
-
-    if uploader_info_file:
-        with open(uploader_info_file[0], "rb") as reader:
-            info_file = json.load(reader)
-            return info_file["Upload Status"] == "Complete"
-
-    return False
+    def find_run_directory_list(run_dir):
+        """Find and return all directories (including this one) in the specified directory.
+
+        Arguments:
+        run_dir -- the directory to find directories in
+
+        Returns: a list of directories, including the current one
+        """
+        # Check that we can access the given directory; return an empty list and log a warning if we cannot.
+        if not os.access(run_dir, os.W_OK):
+            logging.warning("The following directory is not writeable, "
+                            "cannot upload any samples from this directory {}".format(run_dir))
+            return []
+
+        dir_list = next(os.walk(run_dir))[1]  # Get the list of directories in the directory
+        dir_list.append(run_dir)  # Add the current directory to the list too
+        return dir_list
+
+    def dir_has_samples_not_uploaded(sample_dir):
+        """Check whether a run directory contains samples that still need to be uploaded.
+
+        Determines if the run has already been uploaded, partially uploaded, or not uploaded.
+
+        Arguments:
+        sample_dir -- the directory to check for sequencing runs
+
+        Returns: Boolean,
+        True:  directory has samples not yet uploaded, or
+               directory has partially uploaded samples
+        False: directory has no samples, or
+               directory samples are already uploaded, or
+               directory cannot be read (permissions issue)
+        """
+        # Check that we can write to the directory; return False and log a warning if we cannot.
+        if not os.access(sample_dir, os.W_OK):
+            logging.warning("The following directory is not writeable, "
+                            "cannot upload any samples from this directory {}".format(sample_dir))
+            return False
+
+        file_list = next(os.walk(sample_dir))[2]  # Get the list of files in the directory
+        if 'SampleSheet.csv' in file_list:
+            if '.miseqUploaderInfo' in file_list:  # Must check the status of the upload to determine if it completed
+                uploader_info_file = os.path.join(sample_dir, '.miseqUploaderInfo')
+                with open(uploader_info_file, "rb") as reader:
+                    info_file = json.load(reader)
+                    return info_file["Upload Status"] != "Complete"  # True: has samples whose upload is incomplete
+            else:  # SampleSheet.csv with no .miseqUploaderInfo file: has samples not uploaded yet
+                return True
+
+        return False  # No SampleSheet.csv, does not have samples
+
+    logging.info("looking for sample sheet in {}".format(directory))
+
+    sample_sheets = []
+    directory_list = find_run_directory_list(directory)
+    for d in directory_list:
+        current_directory = os.path.join(directory, d)
+        if dir_has_samples_not_uploaded(current_directory):
+            sample_sheets.append(os.path.join(current_directory, 'SampleSheet.csv'))

+    logging.info("found sample sheets (filtered): {}".format(", ".join(sample_sheets)))
+
+    # Only append sheets that do not have errors to the list;
+    # the errors are collected to build a list to show the user.
+    sequencing_runs = []
+    for sheet in sample_sheets:
+        try:
+            sequencing_runs.append(process_sample_sheet(sheet))
+        except SampleSheetError as e:
+            logging.exception("Failed to parse sample sheet.")
+            send_message(DirectoryScannerTopics.garbled_sample_sheet, sample_sheet=sheet, error=e)
+        except SampleError as e:
+            logging.exception("Failed to parse sample.")
+            send_message(DirectoryScannerTopics.garbled_sample_sheet, sample_sheet=sheet, error=e)
+        except SequenceFileError as e:
+            logging.exception("Failed to find files for sample sheet.")
+            send_message(DirectoryScannerTopics.missing_files, sample_sheet=sheet, error=e)
+
+    send_message(DirectoryScannerTopics.finished_run_scan)
+
+    return sequencing_runs

 def process_sample_sheet(sample_sheet):
     """Create a SequencingRun object for the specified sample sheet.
@@ -75,33 +133,23 @@ def process_sample_sheet(sample_sheet):
     Returns: an individual SequencingRun object for the sample sheet,
     ready to be uploaded.
     """
-    try:
-        logging.info("going to parse metadata")
-        run_metadata = parse_metadata(sample_sheet)
-        logging.info("going to parse samples")
-        samples = complete_parse_samples(sample_sheet)
-        logging.info("going to build sequencing run")
-        sequencing_run = SequencingRun(run_metadata, samples, sample_sheet)
-        logging.info("going to validate sequencing run")
-        validate_run(sequencing_run)
-        send_message(DirectoryScannerTopics.run_discovered, run=sequencing_run)
-        return sequencing_run
-    except SampleSheetError, e:
-        logging.exception("Failed to parse sample sheet.")
-        send_message(DirectoryScannerTopics.garbled_sample_sheet, sample_sheet=sample_sheet, error=e)
-    except SampleError, e:
-        logging.exception("Failed to parse sample.")
-        send_message(DirectoryScannerTopics.garbled_sample_sheet, sample_sheet=sample_sheet, error=e)
-    except SequenceFileError as e:
-        logging.exception("Failed to find files for sample sheet.")
-        send_message(DirectoryScannerTopics.missing_files, sample_sheet=sample_sheet, error=e)
-
-    return None
+    logging.info("going to parse metadata")
+    run_metadata = parse_metadata(sample_sheet)
+
+    logging.info("going to parse samples")
+    samples = complete_parse_samples(sample_sheet)
+
+    logging.info("going to build sequencing run")
+    sequencing_run = SequencingRun(run_metadata, samples, sample_sheet)
+
+    logging.info("going to validate sequencing run")
+    validate_run(sequencing_run)
+
+    send_message(DirectoryScannerTopics.run_discovered, run=sequencing_run)
+
+    return sequencing_run
def validate_run(sequencing_run):
"""Do the validation on a run, its samples, and files.
......@@ -118,8 +166,10 @@ def validate_run(sequencing_run):
validation = validate_sample_sheet(sequencing_run.sample_sheet)
if not validation.is_valid():
send_message(sequencing_run.offline_validation_topic, run=sequencing_run, errors=validation.get_errors())
raise SampleSheetError('Sample sheet {} is invalid. Reason:\n {}'.format(sample_sheet, validation.get_errors()), validation.error_list())
raise SampleSheetError('Sample sheet {} is invalid. Reason:\n {}'.format(sample_sheet, validation.get_errors()),
validation.error_list())
validation = validate_sample_list(sequencing_run.sample_list)
if not validation.is_valid():
raise SampleError('Sample sheet {} is invalid. Reason:\n {}'.format(sample_sheet, validation.get_errors()), validation.error_list())
raise SampleError('Sample sheet {} is invalid. Reason:\n {}'.format(sample_sheet, validation.get_errors()),
validation.error_list())
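Both raises follow the convention, visible throughout this diff, of giving validation exceptions two payloads: a log-friendly message and a list of individual errors that the UI renders in its errors tree (see the InvalidSampleSheetsPanel hunk below). A minimal stand-in that matches the calling convention; the attribute name is an assumption, not the project's definition:

class SampleSheetError(Exception):
    def __init__(self, message, error_list):
        super(SampleSheetError, self).__init__(message)
        self.error_list = error_list  # assumed attribute; the real class may differ

# raise SampleSheetError(
#     'Sample sheet SampleSheet.csv is invalid. Reason:\n ...',
#     ['The sample sheet is missing important sections.'])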
import os
import re

def find_file_by_name(directory, name_pattern, depth=-1):
    """Find a file by a name pattern in a directory.

    Traverse through a directory (descending at most `depth` levels) searching
    for a file whose name matches the given pattern.

    Arguments:
    directory -- top level directory to start searching from
    name_pattern -- regular expression to match file names against
                    (applied with re.search)
    depth -- optional, the max depth to descend into the directory. a depth of
    -1 implies no limit.

    Returns: list containing files that match pattern
    """
    if depth == -1:
        walk = lambda directory, depth: os.walk(directory)
    else:
        walk = lambda directory, depth: walklevel(directory, depth)

    result_list = []

    if os.path.isdir(directory):
        for root, dirs, files in walk(directory, depth):
            for filename in filter(lambda file: re.search(name_pattern, file), files):
                result_list.append(os.path.join(root, filename))

    return result_list

def walklevel(directory, depth):
    """Descend into a directory, but only to the specified depth.

    This method is gracelessly borrowed from:
    http://stackoverflow.com/questions/229186/os-walk-without-digging-into-directories-below

    Arguments:
    directory -- the directory in which to start the walk
    depth -- the depth to descend into the directory.

    Returns: a generator for directories under the top level directory.
    """
    directory = directory.rstrip(os.path.sep)
    assert os.path.isdir(directory)
    num_sep = directory.count(os.path.sep)

    for root, dirs, files in os.walk(directory):
        yield root, dirs, files
        num_sep_this = root.count(os.path.sep)
        if num_sep + depth <= num_sep_this:
            del dirs[:]
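Hypothetical usage of the two helpers above, mirroring how the scanner located SampleSheet.csv files before this merge (the path is made up):

if __name__ == "__main__":
    # depth=2: the runs directory itself plus one level of run folders.
    sheets = find_file_by_name(directory="/tmp/runs",
                               name_pattern="^SampleSheet.csv$",
                               depth=2)
    print(sheets)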
@@ -83,7 +83,7 @@ class InvalidSampleSheetsPanel(wx.Panel):
         if isinstance(error, SampleError):
             sheet_errors_type = self._errors_tree.AppendItem(sheet_errors_root, "Error with Sample Data")
         elif isinstance(error, SequenceFileError):
-            sheet_errors_type = self._errors_tree.AppendItem(sheet_errors_root, "Missing FASTQ files")
+            sheet_errors_type = self._errors_tree.AppendItem(sheet_errors_root, "Missing or Duplicate FASTQ files")
         elif isinstance(error, SampleSheetError):
             sheet_errors_type = self._errors_tree.AppendItem(sheet_errors_root, "Missing Important Data")
 import re
-from os import path, listdir
+from os import path, listdir, walk
 from fnmatch import translate as fn_translate
 from csv import reader
 from collections import OrderedDict
@@ -11,8 +11,7 @@ from Model.Sample import Sample
 from Model.SequenceFile import SequenceFile
 from Exceptions.SampleSheetError import SampleSheetError
 from Exceptions.SequenceFileError import SequenceFileError
-from API.fileutils import find_file_by_name
 from API.pubsub import send_message
def parse_metadata(sample_sheet_file):
@@ -69,7 +68,8 @@ def parse_metadata(sample_sheet_file):
             continue

         if not section:
-            raise SampleSheetError("This sample sheet doesn't have any sections.", ["The sample sheet is missing important sections: no sections were found."])
+            raise SampleSheetError("This sample sheet doesn't have any sections.",
+                                   ["The sample sheet is missing important sections: no sections were found."])
         elif section == "header":
             try:
                 key_name = metadata_key_translation_dict[line[0]]
@@ -88,7 +88,8 @@ def parse_metadata(sample_sheet_file):
         metadata_dict["readLengths"] = max(metadata_dict["readLengths"])
     else:
         # this is an exceptional case, you can't have no read lengths!
-        raise SampleSheetError("Sample sheet must have read lengths!", ["The sample sheet is missing important sections: no [Reads] section found."])
+        raise SampleSheetError("Sample sheet must have read lengths!",
+                               ["The sample sheet is missing important sections: no [Reads] section found."])

     return metadata_dict
@@ -110,14 +111,39 @@ def complete_parse_samples(sample_sheet_file):
     returns list containing complete Sample objects
     """
+    def validate_pf_list(file_list):
+        """Check that the list of files grouped together for one sample is valid.
+
+        arguments:
+        file_list -- list of file names that are grouped together
+
+        returns:
+        True: 1 file in the list (single-end read)
+        True: 2 files in the list, where one contains `R1` in the correct position and the other contains `R2`
+        False: fewer than 1 or more than 2 files, or 2 files that do not pair `R1`/`R2` correctly
+        """
+        if len(file_list) > 2:  # We should never expect more than 2 files: a forward & reverse read
+            return False
+        elif len(file_list) == 1:  # single read, valid
+            return True
+        else:
+            # check that one file is R1 and the other is R2
+            regex_filter = ".*_S\\d+_L\\d{3}_R(\\d+)_\\S+\\.fastq.*$"
+            n1 = int(re.search(regex_filter, file_list[0]).group(1))
+            n2 = int(re.search(regex_filter, file_list[1]).group(1))
+            return (n1 != n2) and (n1 == 1 or n1 == 2) and (n2 == 1 or n2 == 2)
+
     sample_list = parse_samples(sample_sheet_file)
     sample_sheet_dir = path.dirname(sample_sheet_file)
     data_dir = path.join(sample_sheet_dir, "Data", "Intensities", "BaseCalls")
+    data_dir_file_list = next(walk(data_dir))[2]  # Build the data directory's file list once, rather than hitting the OS per sample

     uploader_info_file = path.join(sample_sheet_dir, ".miseqUploaderInfo")
     try:
-        with open(uploader_info_file, "rb") as reader:
-            uploader_info = json.load(reader)
+        with open(uploader_info_file, "rb") as info_reader:
+            uploader_info = json.load(info_reader)
     except IOError:
         uploader_info = None
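A worked example, with made-up Illumina-style file names, of the pairing rule validate_pf_list enforces: the regex's capture group is the read number, and a two-file group is valid only when the numbers are 1 and 2 in some order:

import re

regex_filter = ".*_S\\d+_L\\d{3}_R(\\d+)_\\S+\\.fastq.*$"
pair = ["sampleA_S1_L001_R1_001.fastq.gz",
        "sampleA_S1_L001_R2_001.fastq.gz"]

reads = [int(re.search(regex_filter, name).group(1)) for name in pair]
print(reads)                    # [1, 2]
print(sorted(reads) == [1, 2])  # True: a valid forward/reverse pair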
@@ -125,21 +151,21 @@ def complete_parse_samples(sample_sheet_file):
         properties_dict = parse_out_sequence_file(sample)
         # this is the Illumina-defined pattern for naming fastq files, from:
         # http://blog.basespace.illumina.com/2014/08/18/fastq-upload-in-now-available-in-basespace/
-        file_pattern = "{sample_name}_S{sample_number}_L\\d{{3}}_R(\\d+)_\\S+\\.fastq.*$".format(sample_name=re.escape(sample.sample_name),
-                                                                                                 sample_number=sample.sample_number)
+        file_pattern = "{sample_name}_S{sample_number}_L\\d{{3}}_R(\\d+)_\\S+\\.fastq.*$".format(
+            sample_name=re.escape(sample.sample_name), sample_number=sample.sample_number)
         logging.info("Looking for files with pattern {}".format(file_pattern))
-        pf_list = find_file_by_name(directory = data_dir,
-                                    name_pattern = file_pattern,
-                                    depth = 1)
+        regex = re.compile(file_pattern)
+        pf_list = filter(regex.search, data_dir_file_list)
         if not pf_list:
             # OK. So we didn't find any files using the **correct** file name
             # definition according to Illumina. Let's try again with our deprecated
             # behaviour, where we didn't actually care about the sample number:
-            file_pattern = "{sample_name}_S\\d+_L\\d{{3}}_R(\\d+)_\\S+\\.fastq.*$".format(sample_name=re.escape(sample.get_id()))
+            file_pattern = "{sample_name}_S\\d+_L\\d{{3}}_R(\\d+)_\\S+\\.fastq.*$".format(
+                sample_name=re.escape(sample.get_id()))
             logging.info("Looking for files with pattern {}".format(file_pattern))
-            pf_list = find_file_by_name(directory = data_dir,
-                                        name_pattern = file_pattern,
-                                        depth = 1)
+            regex = re.compile(file_pattern)
+            pf_list = filter(regex.search, data_dir_file_list)

             if not pf_list:
                 # we **still** didn't find anything. It's pretty likely, then that
@@ -149,10 +175,21 @@ def complete_parse_samples(sample_sheet_file):
                     ("The uploader was unable to find any files with a file name that ends with "
                      ".fastq.gz for the sample in your sample sheet with name {} in the directory {}. "
                      "This usually happens when the Illumina MiSeq Reporter tool "
-                     "does not generate any FastQ data.")
-                    .format(sample.get_id(), data_dir), ["Sample {}".format(sample.get_id())])
-
-        sq = SequenceFile(properties_dict, pf_list)
+                     "does not generate any FastQ data.").format(
+                         sample.get_id(), data_dir), ["Sample {}".format(sample.get_id())])
+
+        # The list of files may be invalid if the directory being searched has been modified by the user
+        if not validate_pf_list(pf_list):
+            raise SequenceFileError(
+                ("The following file list {} found in the directory {} is invalid. "
+                 "Please verify the folder containing the sequence files matches the SampleSheet file").format(
+                    pf_list, data_dir), ["Sample {}".format(sample.get_id())])
+
+        # Add the dir to each file to create the full path
+        for i in xrange(len(pf_list)):
+            pf_list[i] = path.join(data_dir, pf_list[i])
+
+        sq = SequenceFile(properties_dict, pf_list)
         sample.set_seq_file(deepcopy(sq))

         if uploader_info is not None:
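One portability note on the filter(regex.search, data_dir_file_list) idiom introduced above: under Python 2, which this codebase targets (see xrange and the comma-style except clauses), filter() returns a list, so the later `if not pf_list:` emptiness check works. A sketch of the same lookup written to behave identically on Python 2 and 3:

import re

def matching_files(file_pattern, file_names):
    # A list comprehension always yields a real list, unlike Python 3's
    # filter(), which returns a lazy (and always-truthy) iterator.
    regex = re.compile(file_pattern)
    return [name for name in file_names if regex.search(name)]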
@@ -330,8 +367,8 @@ def get_all_fastq_files(data_dir):
     try:
         file_list = listdir(fastq_files_path)
-        fastq_file_list = [path.join(fastq_files_path, file)
-                           for file in file_list if re.match(pattern, file)]
+        fastq_file_list = [path.join(fastq_files_path, f)
+                           for f in file_list if re.match(pattern, f)]
         fastq_file_list.sort()
     except OSError: