Outlier_analysis.bib

% Shrek (2001), the film directed by Andrew Adamson and Vicky Jenson.
% Typed as misc because a film is not a book and classic BibTeX has no
% dedicated film type; the directors are kept in the author field.
% The title is a single proper noun, so it is brace-protected as a whole.
@misc{adamson2001,
  author       = {Adamson, Andrew and Jenson, Vicky},
  title        = {{Shrek}},
  howpublished = {Film},
  year         = {2001}
}

% Aggarwal, "Outlier Analysis" (book, 2017, Springer International Publishing).
% The isbn field holds two space-separated ISBNs exactly as exported.
@book{aggarwal2017,
  author    = {Aggarwal, Charu C.},
  title     = {Outlier {{Analysis}}},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2017},
  isbn      = {978-3-319-47577-6 978-3-319-47578-3},
  doi       = {10.1007/978-3-319-47578-3},
  url       = {http://link.springer.com/10.1007/978-3-319-47578-3},
  urldate   = {2024-02-25},
  langid    = {english},
  keywords  = {Anomaly detection,Artificial intelligence,Data mining,Machine learning,Matrix factorization,Network outlier detection,Novelty detection,Outlier Analysis,Outlier detection,Outlier ensembles,Spatial outliers,Streaming outlier detection,Temporal anomaly detection,Temporal outlier detection,Text outliers},
  file      = {/Users/robwiederstein/Zotero/storage/EHIMJW8H/Aggarwal - 2017 - Outlier Analysis.pdf}
}

% Campos et al. (2016), journal article in Data Mining and Knowledge Discovery
% on evaluating unsupervised outlier detection (measures, datasets, study).
@article{campos2016,
  author     = {Campos, Guilherme O. and Zimek, Arthur and Sander, J{\"o}rg and Campello, Ricardo J. G. B. and Micenkov{\'a}, Barbora and Schubert, Erich and Assent, Ira and Houle, Michael E.},
  title      = {On the Evaluation of Unsupervised Outlier Detection: Measures, Datasets, and an Empirical Study},
  shorttitle = {On the Evaluation of Unsupervised Outlier Detection},
  journal    = {Data Mining and Knowledge Discovery},
  volume     = {30},
  number     = {4},
  pages      = {891--927},
  year       = {2016},
  month      = jul,
  issn       = {1573-756X},
  doi        = {10.1007/s10618-015-0444-8},
  url        = {https://doi.org/10.1007/s10618-015-0444-8},
  urldate    = {2024-02-26},
  abstract   = {The evaluation of unsupervised outlier detection algorithms is a constant challenge in data mining research. Little is known regarding the strengths and weaknesses of different standard outlier detection models, and the impact of parameter choices for these algorithms. The scarcity of appropriate benchmark datasets with ground truth annotation is a significant impediment to the evaluation of outlier methods. Even when labeled datasets are available, their suitability for the outlier detection task is typically unknown. Furthermore, the biases of commonly-used evaluation measures are not fully understood. It is thus difficult to ascertain the extent to which newly-proposed outlier detection methods improve over established methods. In this paper, we perform an extensive experimental study on the performance of a representative set of standard k nearest neighborhood-based methods for unsupervised outlier detection, across a wide variety of datasets prepared for this purpose. Based on the overall performance of the outlier detection methods, we provide a characterization of the datasets themselves, and discuss their suitability as outlier detection benchmark sets. We also examine the most commonly-used measures for comparing the performance of different methods, and suggest adaptations that are more suitable for the evaluation of outlier detection results.},
  langid     = {english},
  keywords   = {Datasets,Evaluation,Measures,Unsupervised outlier detection},
  file       = {/Users/robwiederstein/Zotero/storage/TZME2NSH/Campos et al. - 2016 - On the evaluation of unsupervised outlier detection measures, datasets, and an empirical study.pdf}
}

% Goldstein and Uchida (2016), PLOS ONE article comparing unsupervised
% anomaly detection algorithms; pages holds the article identifier e0152173.
@article{goldstein2016,
  author   = {Goldstein, Markus and Uchida, Seiichi},
  editor   = {Zhu, Dongxiao},
  title    = {A {{Comparative Evaluation}} of {{Unsupervised Anomaly Detection Algorithms}} for {{Multivariate Data}}},
  journal  = {PLOS ONE},
  volume   = {11},
  number   = {4},
  pages    = {e0152173},
  year     = {2016},
  month    = apr,
  issn     = {1932-6203},
  doi      = {10.1371/journal.pone.0152173},
  url      = {https://dx.plos.org/10.1371/journal.pone.0152173},
  urldate  = {2024-02-25},
  abstract = {Anomaly detection is the process of identifying unexpected items or events in datasets, which differ from the norm. In contrast to standard classification tasks, anomaly detection is often applied on unlabeled data, taking only the internal structure of the dataset into account. This challenge is known as unsupervised anomaly detection and is addressed in many practical applications, for example in network intrusion detection, fraud detection as well as in the life science and medical domain. Dozens of algorithms have been proposed in this area, but unfortunately the research community still lacks a comparative universal evaluation as well as common publicly available datasets. These shortcomings are addressed in this study, where 19 different unsupervised anomaly detection algorithms are evaluated on 10 different datasets from multiple application domains. By publishing the source code and the datasets, this paper aims to be a new well-funded basis for unsupervised anomaly detection research. Additionally, this evaluation reveals the strengths and weaknesses of the different approaches for the first time. Besides the anomaly detection performance, computational effort, the impact of parameter settings as well as the global/local anomaly detection behavior is outlined. As a conclusion, we give an advise on algorithm selection for typical real-world tasks.},
  langid   = {english},
  file     = {/Users/robwiederstein/Zotero/storage/8YKSFL2Q/Goldstein and Uchida - 2016 - A Comparative Evaluation of Unsupervised Anomaly Detection Algorithms for Multivariate Data.pdf}
}

% Hawkins (1980), "Identification of Outliers" (book, Springer).
% The middle initial now carries its period so styles format the name correctly.
% NOTE(review): volume 11 has no accompanying series field -- presumably a
% series number from a scholar-style export; confirm the series name or drop it.
@book{hawkins1980identification,
  author    = {Hawkins, Douglas M.},
  title     = {Identification of Outliers},
  volume    = {11},
  publisher = {Springer},
  year      = {1980}
}

% Pang, Shen, Cao and van den Hengel (2021), survey article in ACM Computing Surveys.
% The last author is written as "van den Hengel, Anton" so that "van den" is
% parsed as the surname particle instead of as part of the given name.
% The journal is stored under its full name without the acronym suffix, and the
% publisher name is separated from the publisher's city.
@article{pang2021deep,
  author    = {Pang, Guansong and Shen, Chunhua and Cao, Longbing and van den Hengel, Anton},
  title     = {Deep Learning for Anomaly Detection: {{A}} Review},
  journal   = {ACM Computing Surveys},
  volume    = {54},
  number    = {2},
  pages     = {1--38},
  year      = {2021},
  publisher = {ACM},
  address   = {New York, NY, USA}
}

% Ruff et al. (2021), review article in Proceedings of the IEEE.
% Accented author names use brace-wrapped LaTeX escapes, so they sort and
% print correctly under classic BibTeX (same convention as the other
% accented names in this file).
% The issn field holds two comma-separated ISSNs exactly as exported.
@article{ruff2021,
  author  = {Ruff, Lukas and Kauffmann, Jacob R. and Vandermeulen, Robert A. and Montavon, Gr{\'e}goire and Samek, Wojciech and Kloft, Marius and Dietterich, Thomas G. and M{\"u}ller, Klaus-Robert},
  title   = {A {{Unifying Review}} of {{Deep}} and {{Shallow Anomaly Detection}}},
  journal = {Proceedings of the IEEE},
  volume  = {109},
  number  = {5},
  pages   = {756--795},
  year    = {2021},
  month   = may,
  issn    = {0018-9219, 1558-2256},
  doi     = {10.1109/JPROC.2021.3052449},
  url     = {https://ieeexplore.ieee.org/document/9347460/},
  urldate = {2024-02-26},
  langid  = {english},
  file    = {/Users/robwiederstein/Zotero/storage/SVYYH8LT/Ruff et al. - 2021 - A Unifying Review of Deep and Shallow Anomaly Detection.pdf}
}

% Singh (2012), article in the International Journal of Computer Science Issues.
% The journal is stored under its full name without the acronym suffix; the
% former publisher value named an indexing service, not a publisher, and is dropped.
% NOTE(review): pages gives a single page number -- presumably only the first
% page of the article; confirm the full range.
% NOTE(review): only one author is listed -- verify against the paper.
@article{singh2012outlier,
  author  = {Singh, Karanjit},
  title   = {Outlier Detection: Applications and Techniques},
  journal = {International Journal of Computer Science Issues},
  volume  = {9},
  number  = {1},
  pages   = {307},
  year    = {2012},
  file    = {/Users/robwiederstein/Zotero/storage/FEVCZC4L/Singh - 2012 - Outlier detection applications and techniques.pdf}
}

% Talagala, Hyndman and Smith-Miles (2021), article in the Journal of
% Computational and Graphical Statistics; the hyphenated surname is braced.
@article{talagala2021,
  author    = {Talagala, Priyanga Dilini and Hyndman, Rob J. and {Smith-Miles}, Kate},
  title     = {Anomaly {{Detection}} in {{High-Dimensional Data}}},
  journal   = {Journal of Computational and Graphical Statistics},
  volume    = {30},
  number    = {2},
  pages     = {360--374},
  year      = {2021},
  month     = jun,
  publisher = {Taylor \& Francis},
  issn      = {1061-8600},
  doi       = {10.1080/10618600.2020.1807997},
  url       = {https://doi.org/10.1080/10618600.2020.1807997},
  urldate   = {2024-02-25},
  abstract  = {The HDoutliers algorithm is a powerful unsupervised algorithm for detecting anomalies in high-dimensional data, with a strong theoretical foundation. However, it suffers from some limitations that significantly hinder its performance level, under certain circumstances. In this article, we propose an algorithm that addresses these limitations. We define an anomaly as an observation where its k-nearest neighbor distance with the maximum gap is significantly different from what we would expect if the distribution of k-nearest neighbors with the maximum gap is in the maximum domain of attraction of the Gumbel distribution. An approach based on extreme value theory is used for the anomalous threshold calculation. Using various synthetic and real datasets, we demonstrate the wide applicability and usefulness of our algorithm, which we call the stray algorithm. We also demonstrate how this algorithm can assist in detecting anomalies present in other data structures using feature engineering. We show the situations where the stray algorithm outperforms the HDoutliers algorithm both in accuracy and computational time. This framework is implemented in the open source R package stray. Supplementary materials for this article are available online.},
  keywords  = {anomaly,Extreme value theory,gumbel,High-dimensional data,knn,Nearest neighbor searching,pca,Temporal data,Unsupervised outlier detection},
  file      = {/Users/robwiederstein/Zotero/storage/WQ3T9D7M/Talagala et al. - 2021 - Anomaly Detection in High-Dimensional Data.pdf}
}

% Wilkinson and Wills (2008), article in the Journal of Computational and
% Graphical Statistics; no doi or url is recorded for this entry.
@article{wilkinson2008scagnostics,
  author    = {Wilkinson, Leland and Wills, Graham},
  title     = {Scagnostics Distributions},
  journal   = {Journal of Computational and Graphical Statistics},
  volume    = {17},
  number    = {2},
  pages     = {473--491},
  year      = {2008},
  publisher = {Taylor \& Francis},
  file      = {/Users/robwiederstein/Zotero/storage/XNISWIWZ/Wilkinson and Wills - 2008 - Scagnostics distributions.pdf}
}

% Wilkinson (2018), article in IEEE Transactions on Visualization and Computer
% Graphics presenting the hdoutliers algorithm (per the abstract).
@article{wilkinson2018,
  author   = {Wilkinson, Leland},
  title    = {Visualizing {{Big Data Outliers Through Distributed Aggregation}}},
  journal  = {IEEE Transactions on Visualization and Computer Graphics},
  volume   = {24},
  number   = {1},
  pages    = {256--266},
  year     = {2018},
  month    = jan,
  issn     = {1077-2626},
  doi      = {10.1109/TVCG.2017.2744685},
  url      = {http://ieeexplore.ieee.org/document/8019881/},
  urldate  = {2024-02-25},
  abstract = {Visualizing outliers in massive datasets requires statistical pre-processing in order to reduce the scale of the problem to a size amenable to rendering systems like D3, Plotly or analytic systems like R or SAS. This paper presents a new algorithm, called hdoutliers, for detecting multidimensional outliers. It is unique for a) dealing with a mixture of categorical and continuous variables, b) dealing with big-p (many columns of data), c) dealing with big-n (many rows of data), d) dealing with outliers that mask other outliers, and e) dealing consistently with unidimensional and multidimensional datasets. Unlike ad hoc methods found in many machine learning papers, hdoutliers is based on a distributional model that allows outliers to be tagged with a probability. This critical feature reduces the likelihood of false discoveries.},
  langid   = {english},
  file     = {/Users/robwiederstein/Zotero/storage/BMH3S5HP/Wilkinson - 2018 - Visualizing Big Data Outliers Through Distributed Aggregation.pdf}
}

% Web page "Outliers detection in R" on the Stats and R site.
% The site name is stored in howpublished: standard styles do not print a
% journal field on a misc entry, so it was previously dropped from the output.
% NOTE(review): no author or year is recorded -- add them if the page states them.
@misc{zotero-3553,
  title        = {Outliers Detection in {{R}}},
  howpublished = {Stats and R},
  url          = {https://statsandr.com/blog/outliers-detection-in-r/},
  urldate      = {2024-02-25},
  abstract     = {Learn how to detect outliers in R thanks to descriptive statistics and via the Hampel filter, the Grubbs, the Dixon and the Rosner tests for outliers},
  langid       = {english},
  file         = {/Users/robwiederstein/Zotero/storage/4YQK84YQ/outliers-detection-in-r.html}
}

% Web page for ODDS (Outlier Detection DataSets), hosted at odds.cs.stonybrook.edu.
% No author or year is recorded for this entry.
@misc{zotero-3603,
  title    = {{{ODDS}} -- {{Outlier Detection DataSets}}},
  url      = {https://odds.cs.stonybrook.edu/},
  urldate  = {2024-02-25},
  langid   = {american},
  keywords = {datasets},
  file     = {/Users/robwiederstein/Zotero/storage/EFLPRBYW/odds.cs.stonybrook.edu.html}
}

% Web page for the USDA ERS Rural-Urban Commuting Area Codes data product.
% No author or year is recorded for this entry.
@misc{zotero-3702,
  title   = {{{USDA ERS}} - {{Rural-Urban Commuting Area Codes}}},
  url     = {https://www.ers.usda.gov/data-products/rural-urban-commuting-area-codes/},
  urldate = {2024-03-21},
  file    = {/Users/robwiederstein/Zotero/storage/G9FWQTEC/rural-urban-commuting-area-codes.html}
}