// blob: ded9030209d50cb389e4e75141ebc714f2a72644 [file] [log] [blame]
/*
* Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of Oracle nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
* IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This source code is provided to illustrate the usage of a given feature
* or technique and has been deliberately simplified. Additional steps
* required for a production-quality application, such as security checks,
* input validation, and proper error handling, might not be present in
* this sample code.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.function.*;
import java.util.regex.Pattern;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import static java.lang.Double.parseDouble;
import static java.util.stream.Collectors.*;
/**
* CSVProcessor is a tool for processing CSV files. There are several
* command-line options. Consult the {@link #printUsageAndExit} method for
* instructions and command line parameters. This sample shows examples of the
* following features:
* <ul>
* <li>Lambda and bulk operations. Working with streams: map(...), filter(...),
* sorted(...) methods. The collect(...) method with different collectors:
* Collectors.maxBy(...), Collectors.minBy(...), Collectors.toList(),
* Collectors.toCollection(...), Collectors.groupingBy(...),
* Collectors.summarizingDouble(...), and a custom Collector.</li>
* <li>Static method reference for printing values.</li>
* <li>Try-with-resources feature for closing files.</li>
* <li>Switch by String feature.</li>
* <li>Other new APIs: Pattern.asPredicate(), BinaryOperator,
* BufferedReader.lines(), Collection.forEach(...), Comparator.comparing(...),
* Comparator.reversed(), Arrays.stream(...).</li>
* </ul>
*
*/
public class CSVProcessor {

    /**
     * The main method for the CSVProcessor program. Run the program with an
     * empty argument list to see possible arguments.
     *
     * The last argument is always the CSV file; its first line is taken as
     * the header that names the columns. Any usage error or I/O failure
     * prints the usage text and terminates the JVM with exit code 1.
     *
     * @param args the argument list for CSVProcessor.
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            printUsageAndExit();
        }
        String fileName = args[args.length - 1];
        // Files.newBufferedReader already returns a BufferedReader, so no
        // additional wrapping is needed.
        try (BufferedReader br = Files.newBufferedReader(Paths.get(fileName))) {
            //Assume that the first line contains column names.
            String headerLine = br.readLine();
            if (headerLine == null) {
                // readLine() returns null at end of stream: the file is empty.
                printUsageAndExit("File " + fileName + " is empty");
                return;//Should not be reached.
            }
            List<String> header = Arrays.stream(headerLine.split(","))
                    .map(String::trim).collect(toList());
            //Calculate an index of the column in question.
            int column = getColumnNumber(header, args[1]);
            switch (args[0]) {
                case "sort":
                    verifyArgumentNumber(args, 4);
                    //Define the sort order.
                    boolean isAsc;
                    // Locale.ROOT keeps the keyword match independent of the
                    // default locale.
                    switch (args[2].toUpperCase(Locale.ROOT)) {
                        case "ASC":
                            isAsc = true;
                            break;
                        case "DESC":
                            isAsc = false;
                            break;
                        default:
                            printUsageAndExit("Illegal argument: " + args[2]);
                            return;//Should not be reached.
                    }
                    /*
                     * Create a comparator that compares lines by comparing
                     * values in the specified column.
                     */
                    Comparator<String> cmp
                            = Comparator.comparing(str -> getCell(str, column),
                                    String.CASE_INSENSITIVE_ORDER);
                    /*
                     * sorted(...) is used to sort records.
                     * forEach(...) is used to output sorted records.
                     */
                    br.lines().sorted(isAsc ? cmp : cmp.reversed())
                            .forEach(System.out::println);
                    break;
                case "search":
                    verifyArgumentNumber(args, 4);
                    /*
                     * Records are filtered by a regex.
                     * forEach(...) is used to output filtered records.
                     */
                    Predicate<String> pattern
                            = Pattern.compile(args[2]).asPredicate();
                    br.lines().filter(str -> pattern.test(getCell(str, column)))
                            .forEach(System.out::println);
                    break;
                case "groupby":
                    verifyArgumentNumber(args, 3);
                    /*
                     * Group lines by values in the column with collect(...), and
                     * print with forEach(...) for every distinct value within
                     * the column.
                     */
                    br.lines().collect(
                            Collectors.groupingBy(str -> getCell(str, column),
                                    toCollection(TreeSet::new)))
                            .forEach((str, set) -> {
                                System.out.println(str + ":");
                                set.forEach(System.out::println);
                            });
                    break;
                case "stat":
                    verifyArgumentNumber(args, 3);
                    /*
                     * The records are traversed several times. They are read
                     * into a list once (the header line is already consumed)
                     * so that every pass can stream them again without
                     * relying on mark/reset and a fixed read-ahead limit.
                     */
                    List<String> records = br.lines().collect(toList());
                    /*
                     * Statistics can be collected by a custom collector in one
                     * pass. One pass is preferable.
                     */
                    System.out.println(
                            records.stream().collect(new Statistics(column)));
                    /*
                     * Alternatively, statistics can be collected
                     * by a built-in API in several passes.
                     * This method demonstrates how separate operations can be
                     * implemented using a built-in API.
                     */
                    statInSeveralPasses(records, column);
                    break;
                default:
                    printUsageAndExit("Illegal argument: " + args[0]);
            }
        } catch (IOException e) {
            printUsageAndExit(e.toString());
        }
    }

    /**
     * Prints max, min, average and sum for the given column, using one
     * stream pass over the records per built-in collector.
     *
     * @param records the data lines of the CSV file (header excluded).
     * @param column the index of the column holding numeric values.
     */
    private static void statInSeveralPasses(List<String> records, int column) {
        System.out.println("#-----Statistics in several passes-------#");
        //Create a comparator to compare records by the column.
        Comparator<String> comparator = Comparator.comparingDouble(
                str -> parseDouble(getCell(str, column)));
        //Find max record by using Collectors.maxBy(...)
        //An empty input yields an empty Optional; print "null" in that case,
        //which is what the one-pass Statistics collector prints as well.
        System.out.println("Max: "
                + records.stream().collect(maxBy(comparator)).orElse(null));
        //Find min record by using Collectors.minBy(...)
        System.out.println("Min: "
                + records.stream().collect(minBy(comparator)).orElse(null));
        //Compute the average value and sum with
        //Collectors.summarizingDouble(...)
        DoubleSummaryStatistics doubleSummaryStatistics
                = records.stream().collect(summarizingDouble(
                        str -> parseDouble(getCell(str, column))));
        System.out.println("Average: " + doubleSummaryStatistics.getAverage());
        System.out.println("Sum: " + doubleSummaryStatistics.getSum());
    }

    /**
     * Prints the usage text and exits unless exactly {@code n} arguments
     * were supplied.
     *
     * @param args the command-line arguments.
     * @param n the expected number of arguments.
     */
    private static void verifyArgumentNumber(String[] args, int n) {
        if (args.length != n) {
            printUsageAndExit("Expected " + n + " arguments but was "
                    + args.length);
        }
    }

    /**
     * Looks up the index of the named column in the header. Prints the usage
     * text and exits if the header has no such column.
     *
     * @param header the trimmed column names.
     * @param name the column name to look up.
     * @return the zero-based index of the column.
     */
    private static int getColumnNumber(List<String> header, String name) {
        int column = header.indexOf(name);
        if (column == -1) {
            printUsageAndExit("There is no column with name " + name);
        }
        return column;
    }

    /**
     * Returns the trimmed value of one cell of a comma-separated record.
     *
     * @param record a single line of the CSV file.
     * @param column the zero-based index of the cell.
     * @return the trimmed cell value; empty if the cell is empty.
     * @throws IllegalArgumentException if the record has fewer than
     * {@code column + 1} cells.
     */
    private static String getCell(String record, int column) {
        // A negative limit keeps trailing empty cells; with the default
        // limit "a,b," would be split into two cells instead of three.
        String[] cells = record.split(",", -1);
        if (column >= cells.length) {
            throw new IllegalArgumentException("Record \"" + record
                    + "\" has no cell with index " + column);
        }
        return cells[column].trim();
    }

    /**
     * Prints the usage text to standard output and the given messages to
     * standard error, then terminates the JVM with exit code 1.
     *
     * @param str optional error messages to print after the usage text.
     */
    private static void printUsageAndExit(String... str) {
        System.out.println("Usages:");
        System.out.println("CSVProcessor sort COLUMN_NAME ASC|DESC FILE");
        System.out.println("Sort lines by column COLUMN_NAME in CSV FILE\n");
        System.out.println("CSVProcessor search COLUMN_NAME REGEX FILE");
        System.out.println("Search for REGEX in column COLUMN_NAME in CSV FILE\n");
        System.out.println("CSVProcessor groupby COLUMN_NAME FILE");
        System.out.println("Split lines into different groups according to column "
                + "COLUMN_NAME value\n");
        System.out.println("CSVProcessor stat COLUMN_NAME FILE");
        System.out.println("Compute max/min/average/sum statistics by column "
                + "COLUMN_NAME\n");
        Arrays.asList(str).forEach(System.err::println);
        System.exit(1);
    }

    /*
     * This is a custom implementation of the Collector interface.
     * A Statistics object gathers max, min, sum and average statistics.
     * It serves both as the collector and as its own result container.
     */
    private static class Statistics
            implements Collector<String, Statistics, Statistics> {

        /*
         * This implementation does not need to be thread safe because
         * the parallel implementation of
         * {@link java.util.stream.Stream#collect Stream.collect()}
         * provides the necessary partitioning and isolation for safe parallel
         * execution.
         */
        private String maxRecord;
        private String minRecord;
        private double sum;
        private int lineCount;
        private final BinaryOperator<String> maxOperator;
        private final BinaryOperator<String> minOperator;
        private final int column;

        public Statistics(int column) {
            this.column = column;
            // Records are ordered by the numeric value of the chosen column.
            Comparator<String> cmp = Comparator.comparingDouble(
                    str -> parseDouble(getCell(str, column)));
            maxOperator = BinaryOperator.maxBy(cmp);
            minOperator = BinaryOperator.minBy(cmp);
        }

        /*
         * Process line.
         */
        public Statistics accept(String line) {
            maxRecord = maxRecord == null
                    ? line : maxOperator.apply(maxRecord, line);
            minRecord = minRecord == null
                    ? line : minOperator.apply(minRecord, line);
            sum += parseDouble(getCell(line, column));
            lineCount++;
            return this;
        }

        /*
         * Merge two Statistics.
         * A container that has seen no lines has null max/min records, which
         * must not reach the comparator; in that case the other container
         * already is the merged result.
         */
        public Statistics combine(Statistics stat) {
            if (stat.getLineCount() == 0) {
                return this;
            }
            if (lineCount == 0) {
                return stat;
            }
            maxRecord = maxOperator.apply(maxRecord, stat.getMaxRecord());
            minRecord = minOperator.apply(minRecord, stat.getMinRecord());
            sum += stat.getSum();
            lineCount += stat.getLineCount();
            return this;
        }

        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("#------Statistics------#\n");
            sb.append("Max: ").append(getMaxRecord()).append("\n");
            sb.append("Min: ").append(getMinRecord()).append("\n");
            sb.append("Sum = ").append(getSum()).append("\n");
            sb.append("Average = ").append(average()).append("\n");
            sb.append("#------Statistics------#\n");
            return sb.toString();
        }

        @Override
        public Supplier<Statistics> supplier() {
            return () -> new Statistics(column);
        }

        @Override
        public BiConsumer<Statistics, String> accumulator() {
            return Statistics::accept;
        }

        @Override
        public BinaryOperator<Statistics> combiner() {
            return Statistics::combine;
        }

        @Override
        public Function<Statistics, Statistics> finisher() {
            return stat -> stat;
        }

        @Override
        public Set<Characteristics> characteristics() {
            return EnumSet.of(Characteristics.IDENTITY_FINISH);
        }

        private String getMaxRecord() {
            return maxRecord;
        }

        private String getMinRecord() {
            return minRecord;
        }

        private double getSum() {
            return sum;
        }

        private double average() {
            return sum / lineCount;
        }

        private int getLineCount() {
            return lineCount;
        }
    }
}