Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A more compact and useful representation for unphased data #21

Merged
merged 6 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion igdtools/igdtools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,8 @@ int main(int argc, char* argv[]) {
sampleList.resize(j);
}
}
assert(sampleCt <= effectiveSampleCt);
const auto sampleCt = sampleList.size();
assert(sampleCt <= effectiveSampleCt);
const double freq = (double)sampleCt / (double)effectiveSampleCt;
if (freq < fLower || freq >= fUpper) {
continue;
dcdehaas marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
179 changes: 135 additions & 44 deletions picovcf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1312,12 +1312,25 @@ class IGDData {
// so we can't rely on them for a consistent serialization format. Instead we
// "steal" the upper byte of the bpPosition to store whether or not the row at
// the given position is sparse.
enum {
BP_POS_FLAGS_MASK = 0xFF00000000000000,
static constexpr uint64_t BP_POS_FLAGS_MASK = 0xFF00000000000000;
static constexpr uint64_t BP_POS_FLAGS_SPARSE = 0x0100000000000000;
static constexpr uint64_t BP_POS_FLAGS_IS_MISSING = 0x0200000000000000;

BP_POS_FLAGS_SPARSE = 0x0100000000000000,
BP_POS_FLAGS_IS_MISSING = 0x0200000000000000,
};
// We steal the next byte for indicating how many copies of the alternate allele
// are represented by the current variant. For example, a value of 2 for alt allele
// 'A' and ref allele 'T' means that the diploid has AA. A value of 1 means AT or
// TA (order is irrelevant), and the absence of the sample in any list means TT.
static constexpr uint64_t BP_POS_COPY_SHIFT = 48;
static constexpr uint64_t BP_POS_COPY_MASK = ((uint64_t)0xFF << BP_POS_COPY_SHIFT);

// The mask to get only the position value.
static constexpr uint64_t BP_POS_ONLY_MASK = ~(BP_POS_FLAGS_MASK | BP_POS_COPY_MASK);

// The maximum value of a position, (2^48)-1
static constexpr uint64_t MAX_BP_POSITION = 0x0000FFFFFFFFFFFF;

static_assert(MAX_BP_POSITION == BP_POS_ONLY_MASK, "Unexpected BP_POS mask");
static_assert((BP_POS_FLAGS_MASK & BP_POS_COPY_MASK) == 0, "Overlapping masks");

struct IndexEntry {
uint64_t bpPosition; // The base-pair position of the variant.
Expand Down Expand Up @@ -1353,9 +1366,11 @@ class IGDData {

/**
* The number of variants in this file. Note that variants are binary in IGD,
* so this may not be the same number of variants that the corresponding VCF
* file would have (multiple alternate alleles get converted into their own
* variant).
* so this is not necessarily the same as the number of polymorphic sites (i.e.,
* rows in a VCF file). Also for unphased data, each number of copies that is
* stored is considered a variant, so e.g., for diploid datasets Aa and AA are
* considered separate variants.
*
* @return Number of variants (rows of genotype data).
*/
uint64_t numVariants() const { return m_header.numVariants; }
Expand Down Expand Up @@ -1412,15 +1427,37 @@ class IGDData {
* Get the position of the given variant.
* @param[in] variantIndex The 0-based index of the variant, i.e. the row
* number.
* @param[out] isMissing Will be set to true if this variant represents
* missing data.
* @param[out] numCopies Will be set to the number of copies of the alternate
* allele that this variant represents.
* @return The position of the variant on the genome.
*/
uint64_t getPosition(VariantT variantIndex, bool& isMissing, uint8_t& numCopies) {
const size_t offset = getVariantIndexOffset(variantIndex);
m_infile.seekg(offset);
PICOVCF_GOOD_OR_API_MISUSE(m_infile);
const uint64_t position = readScalar<uint64_t>(m_infile);
isMissing = (bool)(position & BP_POS_FLAGS_IS_MISSING);
numCopies = (uint8_t)(position >> BP_POS_COPY_SHIFT);
return position & BP_POS_ONLY_MASK;
}

/**
* Get the position of the given variant.
* @param[in] variantIndex The 0-based index of the variant, i.e. the row
* number.
* @param[out] isMissing Will be set to true if this variant represents
* missing data.
* @return The position of the variant on the genome.
*/
uint64_t getPosition(VariantT variantIndex, bool& isMissing) {
const size_t offset = getVariantIndexOffset(variantIndex);
m_infile.seekg(offset);
PICOVCF_GOOD_OR_API_MISUSE(m_infile);
uint64_t position = readScalar<uint64_t>(m_infile);
const uint64_t position = readScalar<uint64_t>(m_infile);
isMissing = (bool)(position & BP_POS_FLAGS_IS_MISSING);
return (position & ~BP_POS_FLAGS_MASK);
return position & BP_POS_ONLY_MASK;
}

/**
Expand Down Expand Up @@ -1499,13 +1536,15 @@ class IGDData {
}
return std::move(sampleList);
} else {
const SampleT readAmount = picovcf_div_ceiling<SampleT, 8>(numSamples());
const bool phased = (bool)(m_header.flags & IGD_PHASED);
const SampleT numSamples = phased ? (m_header.ploidy * m_header.numIndividuals) : m_header.numIndividuals;
const SampleT readAmount = picovcf_div_ceiling<SampleT, 8>(numSamples);
PICOVCF_RELEASE_ASSERT(readAmount > 0);
std::unique_ptr<uint8_t> buffer(new uint8_t[readAmount]);
if (buffer) {
m_infile.read(reinterpret_cast<char*>(buffer.get()), readAmount);
PICOVCF_GOOD_OR_API_MISUSE(m_infile);
return ::picovcf::getSamplesWithAlt((const uint8_t*)buffer.get(), numSamples());
return ::picovcf::getSamplesWithAlt((const uint8_t*)buffer.get(), numSamples);
}
}
return {};
Expand Down Expand Up @@ -1701,35 +1740,47 @@ class IGDWriter {
}

/**
* Write a multiple rows of sample/genotype data that correspond to a single
* genome position.
* Write a single variant (row) of sample/genotype data.
*
* This can be used for phased or unphased data: when phased the sample list always
* consists of haploid sample indexes, when unphased the sample list consists of
* individual indexes. For the former, indexes are 0 <= index < (ploidy*numIndividuals)
* and for the latter 0 <= index < numIndividuals.
*
* @param[in] outStream The output stream.
* @param[in] genomePosition The position of the variant on the genome.
* @param[in] referenceAllele The string representing the reference allele.
* @param[in] altAlleles The vector of strings representing the alternate
* alleles. Must be at least 1 alternate allele.
* @param[in] alleleIndexes A vector where the position in the vector
* corresponds to the sample index (so, position 0 is the 0th chromosome copy
* of the 0th individual). Each value in the vector is 0-altAlleles.size(),
* where 0 means reference allele and each non-zero value matches the
* altAllele at (value - 1) in the altAlleles vector. If data is missing for
* a particular sample, then use picovcf::MISSING_VALUE at that index.
* alleles. Must be at least 1 alternate allele.
* @param[in] sampleList A list of sample indexes. For phased diploid data, e.g.,
* index 0 is the 0th chromosome copy of the 0th individual, index 1
* is the 1st chromosome copy of the 0th individual, index 2 is the 0th
* chromosome copy of the 1st individual, etc. For unphased data, index 0
* is the 0th individual, index 1 is the 1st individual, etc.
* @param[in] numCopes [Optional] Required only for unphased data. The number of
* copies of the alternate allele this variant represents, 0 < copies <= ploidy.
* @param[in] isMissing [Optional] when set to true, this row represents all the
* samples that have missing data at the given polymorphic site.
*/
void writeVariantSamples(std::ostream& outStream,
const uint64_t genomePosition,
const std::string& referenceAllele,
const std::string& altAllele,
const IGDSampleList& sampleList,
const bool isMissing = false) {
const SampleT numSamples = (m_header.ploidy * m_header.numIndividuals);
const bool isMissing = false,
const uint8_t numCopies = 0) {
const bool phased = (bool)(m_header.flags & IGDData::IGD_PHASED);
const SampleT numSamples = phased ? (m_header.ploidy * m_header.numIndividuals) : m_header.numIndividuals;
const VariantT variantIndex = m_header.numVariants;
assert(sampleList.size() <= numSamples);
assert(phased || numCopies > 0);
assert(phased || numCopies <= m_header.ploidy);

m_referenceAlleles.emplace_back(referenceAllele);
m_alternateAlleles.emplace_back(altAllele);

const bool isSparse = (sampleList.size() <= (numSamples / IGDData::DEFAULT_SPARSE_THRESHOLD));
m_index.push_back(std::move(makeEntry(genomePosition, isSparse, isMissing, outStream.tellp())));
m_index.push_back(std::move(makeEntry(genomePosition, isSparse, isMissing, numCopies, outStream.tellp())));
if (isSparse) {
writeSparse(outStream, sampleList);
m_sparseCount++;
Expand Down Expand Up @@ -1835,12 +1886,18 @@ class IGDWriter {
}
}

IGDData::IndexEntry
makeEntry(VariantT genomePosition, const bool isSparse, const bool isMissing, std::streamoff filePosition) {
IGDData::IndexEntry makeEntry(VariantT genomePosition,
const bool isSparse,
const bool isMissing,
uint8_t numCopies,
std::streamoff filePosition) {
uint64_t position = static_cast<uint64_t>(genomePosition);
if ((IGDData::BP_POS_FLAGS_MASK & position) != 0) {
if (position > IGDData::MAX_BP_POSITION) {
PICOVCF_THROW_ERROR(ApiMisuse, "Given genome position is too large: " << genomePosition);
}
if (numCopies > 0) {
position |= ((uint64_t)numCopies) << IGDData::BP_POS_COPY_SHIFT;
}
if (isSparse) {
position |= IGDData::BP_POS_FLAGS_SPARSE;
}
Expand Down Expand Up @@ -1900,39 +1957,73 @@ inline void vcfToIGD(const std::string& vcfFilename,
IndividualIteratorGT individualIt = variant.getIndividualIterator();
auto altAlleles = variant.getAltAlleles();
IGDSampleList missingData;
std::vector<IGDSampleList> variantGtData(altAlleles.size());
const size_t numSampleLists = isPhased ? altAlleles.size() : (altAlleles.size() * ploidy);
std::vector<IGDSampleList> variantGtData(numSampleLists);
SampleT sampleIndex = 0;
while (individualIt.hasNext()) {
const bool isPhasedI = individualIt.getAlleles(allele1, allele2);
PICOVCF_ASSERT_OR_MALFORMED(isPhasedI == isPhased, "Cannot convert VCF with mixed phasedness");
const uint64_t ploidyI = (allele2 == NOT_DIPLOID) ? 1 : 2;
PICOVCF_ASSERT_OR_MALFORMED(ploidyI == ploidy, "Cannot convert VCF with mixed ploidy");
if (allele1 == MISSING_VALUE) {
missingData.push_back(sampleIndex);
} else if (allele1 > 0) {
variantGtData.at(allele1 - 1).push_back(sampleIndex);
}
sampleIndex++;
if (ploidy == 2) {
if (allele2 == MISSING_VALUE) {
if (isPhased) {
if (allele1 == MISSING_VALUE) {
missingData.push_back(sampleIndex);
} else if (allele2 > 0) {
variantGtData.at(allele2 - 1).push_back(sampleIndex);
} else if (allele1 > 0) {
variantGtData.at(allele1 - 1).push_back(sampleIndex);
}
sampleIndex++;
if (ploidy == 2) {
if (allele2 == MISSING_VALUE) {
missingData.push_back(sampleIndex);
} else if (allele2 > 0) {
variantGtData.at(allele2 - 1).push_back(sampleIndex);
}
sampleIndex++;
}
// sampleIndex refers to _haploid samples_ for unphased data.
} else {
if (allele1 == MISSING_VALUE || allele2 == MISSING_VALUE) {
missingData.push_back(sampleIndex);
} else if (allele1 == allele2 && allele1 > 0) {
// This corresponds to a "2" (numCopies)
variantGtData.at((allele1 - 1) + altAlleles.size()).push_back(sampleIndex);
} else {
// These correspond to a "1" (numCopies). We can have two "1"s in the case where the
// alt alleles are different. It is up to the downstream use-case how to handle this
// (e.g., filter the lower frequency one out, adjust the calculation to handle it, etc.)
if (allele1 > 0) {
variantGtData.at(allele1 - 1).push_back(sampleIndex);
}
if (allele2 > 0) {
variantGtData.at(allele2 - 1).push_back(sampleIndex);
}
}
sampleIndex++; // sampleIndex refers to _individuals_ for unphased data.
}
}
std::string currentVariantId;
if (emitVariantIds) {
currentVariantId = variant.getID();
}
for (size_t altIndex = 0; altIndex < altAlleles.size(); altIndex++) {
writer.writeVariantSamples(outFile,
variant.getPosition(),
variant.getRefAllele(),
altAlleles[altIndex],
variantGtData[altIndex],
false);
const auto position = variant.getPosition();
const auto& ref = variant.getRefAllele();
const auto& alt = altAlleles[altIndex];
const uint8_t numCopies = isPhased ? 0 : 1;
writer.writeVariantSamples(outFile, position, ref, alt, variantGtData[altIndex], false, numCopies);
if (!isPhased && ploidy > 1) {
const size_t copies2Index = altIndex + altAlleles.size();
const auto& sampleList = variantGtData[copies2Index];
if (!sampleList.empty()) {
writer.writeVariantSamples(outFile,
position,
ref,
alt,
sampleList,
false,
/*numCopies=*/2);
}
}
if (emitVariantIds) {
variantIds.emplace_back(currentVariantId);
}
Expand Down
10 changes: 10 additions & 0 deletions test/example_vcfs/unphased.example.vcf

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions test/test_igd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using namespace picovcf;

extern const std::string getMISSING_DATA_EXAMPLE_FILE();
extern const std::string getUNPHASED_DATA_EXAMPLE_FILE();

TEST(IGD, MissingData) {
std::string igdFileName = "missing_data.igd";
Expand All @@ -27,3 +28,33 @@ TEST(IGD, MissingData) {
// Every variant is missing
ASSERT_EQ(numMissing, 5);
}

TEST(IGD, UnphasedData) {
std::string igdFileName = "unphased_data.igd";
vcfToIGD(getUNPHASED_DATA_EXAMPLE_FILE(), igdFileName);

IGDData igdFile(igdFileName);

ASSERT_EQ(igdFile.numVariants(), 7);
size_t totalOnes = 0;
size_t totalTwos = 0;
for (size_t i = 0; i < igdFile.numVariants(); i++) {
bool isMissing = false;
uint8_t numCopies = 0;
igdFile.getPosition(i, isMissing, numCopies);
ASSERT_GT(numCopies, 0);
ASSERT_LE(numCopies, 2);
ASSERT_FALSE(isMissing);
auto sampleList = igdFile.getSamplesWithAlt(i);
for (const auto index : sampleList) {
ASSERT_LT(index, igdFile.numIndividuals());
}
if (numCopies == 1) {
totalOnes += sampleList.size();
} else if (numCopies == 2) {
totalTwos += sampleList.size();
}
}
ASSERT_EQ(totalOnes, 90+325+146+129);
ASSERT_EQ(totalTwos, 2+2+1);
}
4 changes: 4 additions & 0 deletions test/test_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ const std::string getMISSING_DATA_EXAMPLE_FILE() {
return getExampleDir() + "/missing.vcf";
}

const std::string getUNPHASED_DATA_EXAMPLE_FILE() {
return getExampleDir() + "/unphased.example.vcf";
}

int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
Expand Down
Loading