Skip to content

Commit

Permalink
created the load_ipums_extract() function, test cases, etc. (#21)
Browse files Browse the repository at this point in the history
  • Loading branch information
00krishna authored May 6, 2024
1 parent 268acea commit 1e97b67
Show file tree
Hide file tree
Showing 4 changed files with 7,846 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/IPUMS.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,5 @@ module IPUMS
export IPUMSAPI
export parse_ddi
export extract_download
export load_ipums_extract
end
175 changes: 173 additions & 2 deletions src/parsers/ddi_parser.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ function parse_ddi(filepath::String)
# get variables metadata from file
_get_var_metadata_from_ddi!(ddi)

return ddi
return ddi;

end
end;


"""
Expand Down Expand Up @@ -80,6 +80,35 @@ function _check_that_file_is_xml(filepath::String)
end


"""
_check_that_file_is_dat(filepath::String)
This is an internal function and checks whether the provided file is a DAT
file. All IPUMS extract data files should be in DAT format.
### Arguments
- `filepath::String` - A file path that the user wishes to import. The file must be
a DAT file.
### Returns
The function returns nothing if the file is a DAT file. If the file is not
a DAT file, then the function raises an `ArgumentError`.
"""
function _check_that_file_is_dat(filepath::String)

extension = filepath[findlast(==('.'), filepath)+1:end]

if extension != "dat"
throw(ArgumentError("The IPUMS Extract File: $filepath should be an DAT file."))
else
return true
end

end


"""
_check_that_file_exists(filepath::String)
Expand Down Expand Up @@ -308,3 +337,145 @@ function _string_to_num(x::SubString{String})
return parse(Int64, n)
end


"""
load_ipums_extract(ddi::DDIInfo, extract_filepath::String)
This file will take in a parsed DDIInfo object and file path to an IPUMS
DAT extract file, and returns a dataframe containing all of the data.
### Arguments
- `ddi::DDIInfo` - A DDIInfo object, which is the result of parsing a DDI metadata file.
- `extract_filepath::String` - The directory path to an IPUMS extract DAT file.
### Returns
This function outputs a Julia Dataframe that contains all of the data from
the IPUMS extract file. Further, the metadata fields of the Dataframe
contain the metadata parsed from the DDI file.
# Examples
Let's assume we have an extract DDI file named `my_extract.xml`, and an extract
file called `my_extract.dat`.
```julia-repl
julia> ddi = parse_ddi("my_extract.xml");
julia> df = load_ipums_extract(ddi, "my_extract.dat");
```
"""
function load_ipums_extract(ddi::DDIInfo, extract_filepath::String)

# Check that the extract file exists and is a DAT file.
_check_that_file_is_dat(extract_filepath)
_check_that_file_exists(extract_filepath)

# Load ddi column-level metadata to local variables
name_vec = [v.name for v in ddi.variable_info]
range_vec = [v.position_start:v.position_end for v in ddi.variable_info]
dtype_vec = [v.var_dtype[] for v in ddi.variable_info]
dcml_vec = [v.dcml for v in ddi.variable_info]

# Create empty dataframe

df = DataFrame(dtype_vec, name_vec)

# Save extract level metadata to dataframe. This data applies to the entire
# file.

metadata!(df, "conditions", ddi.conditions, style=:note);
metadata!(df, "citation", ddi.citation, style=:note);
metadata!(df, "ipums_project", ddi.ipums_project, style=:note);
metadata!(df, "extract_notes", ddi.extract_notes, style=:note);
metadata!(df, "extract_date", ddi.extract_date, style=:note);

# Setup and write column level metadata to dataframe.
fields = Dict("label" => :labl,
"description" => :desc,
"data type" => :var_dtype,
"numeric type" => :var_interval,
"coding instructions" => :coder_instructions,
"category labels" => :category_labels)

# Iterate over each column in the dataset and save the corresponding metadata
# dictionary for that column to the dataframe.
for i in eachindex(ddi.variable_info)
for (k,v) in fields
colmetadata!(df,
ddi.variable_info[i].name,
k,
getfield(ddi.variable_info[i], v),
style=:note);
end
end

# Load extract data to the dataframe. The original ipums extract is a
# fixed width file with no delimiters. The original file saves float-valued
# variables as integers, hence we must to parse the file to correctly
# extract the float values from corresponding integers.
for line in eachline(extract_filepath)
data = map((x,p,d) -> _parse_data(x, p, d),
[strip(line[r]) for r in range_vec],
[eltype(a) for a in dtype_vec],
[d for d in dcml_vec])
push!(df, data)
end

return df;
end;


"""
_parse_census_numbers_to_float(strnum::SubString{String}, decimals::Int64)
This is an internal function, not for public use. The fixed width format
of the IPUMS DAT files encodes all numbers as integers. For example, a
number like 1015.45 is encoded as "10545". This function parses string
valued number into a floating point number, given the specified number of
decimals.
### Arguments
- `strnum::SubString{String}` - A number that is coded as a string.
- `decimals::Int64` - An integer that indicates the number of decimal place in a floating point number.
### Returns
This function returns the numeric part of the string, coded as an Float64 datatype.
"""
function _parse_census_numbers_to_float(strnum::SubString{String}, decimals::Int64)
return parse(Float64, chop(strnum, tail=decimals) * "." * last(strnum, decimals))
end


"""
_parse_data(strnum::SubString{String}, dtype::DataType, decimals::Int64)
This is an internal function to support the parsing of the fixed width
format of the IPUMS datafile. The file contains only numbers and absolutely
no text. This function determines--based upon DDI metadata--whether a
specific text input is designated as an integer or floating point number,
and then parses that value accordingly.
### Arguments
- `strnum::SubString{String}` - A string that may contain some numeric data encoded as text.
- `dtype::DataType` - The datatype that should be applied in the parsing of string number.
- `decimals::Int64` - The number of decimal values to include in a floating point number.
### Returns
This function returns the parsed number--integer or float--that corresponds to the input string.
"""
function _parse_data(strnum::SubString{String}, dtype::DataType, decimals::Int64)
if dtype == Float64
return _parse_census_numbers_to_float(strnum, decimals)
elseif dtype == Int64
return parse(dtype, strnum)
else
return strnum
end
end
4 changes: 4 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,14 @@ end

@testset "DDI Parser" begin
ddi = parse_ddi("testdata/cps_00157.xml")
datafile = "testdata/cps_00157.dat"
df = load_ipums_extract(ddi, datafile)
@test ddi.extract_date == "2023-07-10"
@test ddi.variable_info[1].position_end == 4
@test ddi._ns == "ddi:codebook:2_5"
@test size(ddi.data_summary) == (8,6)
@test_throws ArgumentError parse_ddi("testdata/cps_00157.dat.gz")
@test_throws ArgumentError parse_ddi("testdata/cps_00156.xml")
@test isa(metadata(df), Dict)
@test isa(colmetadata(df, :YEAR), Dict)
end
Loading

0 comments on commit 1e97b67

Please sign in to comment.