Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

created the load_ipums_extract() function, test cases, etc. #21

Merged
merged 1 commit into from
May 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/IPUMS.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,5 @@ module IPUMS
export IPUMSAPI
export parse_ddi
export extract_download
export load_ipums_extract
end
175 changes: 173 additions & 2 deletions src/parsers/ddi_parser.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ function parse_ddi(filepath::String)
# get variables metadata from file
_get_var_metadata_from_ddi!(ddi)

return ddi
return ddi;

end
end;


"""
Expand Down Expand Up @@ -80,6 +80,35 @@ function _check_that_file_is_xml(filepath::String)
end


"""
_check_that_file_is_dat(filepath::String)

This is an internal function and checks whether the provided file is a DAT
file. All IPUMS extract data files should be in DAT format.

### Arguments

- `filepath::String` - A file path that the user wishes to import. The file must be
a DAT file.

### Returns

The function returns nothing if the file is a DAT file. If the file is not
a DAT file, then the function raises an `ArgumentError`.
"""
function _check_that_file_is_dat(filepath::String)

extension = filepath[findlast(==('.'), filepath)+1:end]

if extension != "dat"
throw(ArgumentError("The IPUMS Extract File: $filepath should be an DAT file."))
else
return true
end

end


"""
_check_that_file_exists(filepath::String)

Expand Down Expand Up @@ -308,3 +337,145 @@ function _string_to_num(x::SubString{String})
return parse(Int64, n)
end


"""
load_ipums_extract(ddi::DDIInfo, extract_filepath::String)

This file will take in a parsed DDIInfo object and file path to an IPUMS
DAT extract file, and returns a dataframe containing all of the data.

### Arguments

- `ddi::DDIInfo` - A DDIInfo object, which is the result of parsing a DDI metadata file.
- `extract_filepath::String` - The directory path to an IPUMS extract DAT file.

### Returns

This function outputs a Julia Dataframe that contains all of the data from
the IPUMS extract file. Further, the metadata fields of the Dataframe
contain the metadata parsed from the DDI file.

# Examples

Let's assume we have an extract DDI file named `my_extract.xml`, and an extract
file called `my_extract.dat`.

```julia-repl
julia> ddi = parse_ddi("my_extract.xml");
julia> df = load_ipums_extract(ddi, "my_extract.dat");
```

"""
function load_ipums_extract(ddi::DDIInfo, extract_filepath::String)

# Check that the extract file exists and is a DAT file.
_check_that_file_is_dat(extract_filepath)
_check_that_file_exists(extract_filepath)

# Load ddi column-level metadata to local variables
name_vec = [v.name for v in ddi.variable_info]
range_vec = [v.position_start:v.position_end for v in ddi.variable_info]
dtype_vec = [v.var_dtype[] for v in ddi.variable_info]
dcml_vec = [v.dcml for v in ddi.variable_info]

# Create empty dataframe

df = DataFrame(dtype_vec, name_vec)

# Save extract level metadata to dataframe. This data applies to the entire
# file.

metadata!(df, "conditions", ddi.conditions, style=:note);
metadata!(df, "citation", ddi.citation, style=:note);
metadata!(df, "ipums_project", ddi.ipums_project, style=:note);
metadata!(df, "extract_notes", ddi.extract_notes, style=:note);
metadata!(df, "extract_date", ddi.extract_date, style=:note);

# Setup and write column level metadata to dataframe.
fields = Dict("label" => :labl,
"description" => :desc,
"data type" => :var_dtype,
"numeric type" => :var_interval,
"coding instructions" => :coder_instructions,
"category labels" => :category_labels)

# Iterate over each column in the dataset and save the corresponding metadata
# dictionary for that column to the dataframe.
for i in eachindex(ddi.variable_info)
for (k,v) in fields
colmetadata!(df,
ddi.variable_info[i].name,
k,
getfield(ddi.variable_info[i], v),
style=:note);
end
end

# Load extract data to the dataframe. The original ipums extract is a
# fixed width file with no delimiters. The original file saves float-valued
# variables as integers, hence we must to parse the file to correctly
# extract the float values from corresponding integers.
for line in eachline(extract_filepath)
data = map((x,p,d) -> _parse_data(x, p, d),
[strip(line[r]) for r in range_vec],
[eltype(a) for a in dtype_vec],
[d for d in dcml_vec])
push!(df, data)
end

return df;
end;


"""
_parse_census_numbers_to_float(strnum::SubString{String}, decimals::Int64)

This is an internal function, not for public use. The fixed width format
of the IPUMS DAT files encodes all numbers as integers. For example, a
number like 1015.45 is encoded as "10545". This function parses string
valued number into a floating point number, given the specified number of
decimals.

### Arguments

- `strnum::SubString{String}` - A number that is coded as a string.
- `decimals::Int64` - An integer that indicates the number of decimal place in a floating point number.

### Returns

This function returns the numeric part of the string, coded as an Float64 datatype.

"""
function _parse_census_numbers_to_float(strnum::SubString{String}, decimals::Int64)
return parse(Float64, chop(strnum, tail=decimals) * "." * last(strnum, decimals))
end


"""
_parse_data(strnum::SubString{String}, dtype::DataType, decimals::Int64)

This is an internal function to support the parsing of the fixed width
format of the IPUMS datafile. The file contains only numbers and absolutely
no text. This function determines--based upon DDI metadata--whether a
specific text input is designated as an integer or floating point number,
and then parses that value accordingly.

### Arguments

- `strnum::SubString{String}` - A string that may contain some numeric data encoded as text.
- `dtype::DataType` - The datatype that should be applied in the parsing of string number.
- `decimals::Int64` - The number of decimal values to include in a floating point number.
### Returns

This function returns the parsed number--integer or float--that corresponds to the input string.

"""
function _parse_data(strnum::SubString{String}, dtype::DataType, decimals::Int64)
if dtype == Float64
return _parse_census_numbers_to_float(strnum, decimals)
elseif dtype == Int64
return parse(dtype, strnum)
else
return strnum
end
end
4 changes: 4 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,14 @@ end

@testset "DDI Parser" begin
ddi = parse_ddi("testdata/cps_00157.xml")
datafile = "testdata/cps_00157.dat"
df = load_ipums_extract(ddi, datafile)
@test ddi.extract_date == "2023-07-10"
@test ddi.variable_info[1].position_end == 4
@test ddi._ns == "ddi:codebook:2_5"
@test size(ddi.data_summary) == (8,6)
@test_throws ArgumentError parse_ddi("testdata/cps_00157.dat.gz")
@test_throws ArgumentError parse_ddi("testdata/cps_00156.xml")
@test isa(metadata(df), Dict)
@test isa(colmetadata(df, :YEAR), Dict)
end
Loading
Loading