blob: b02525d727e417a381c73c6c0dc282bf447799d0 [file] [log] [blame]
// © 2020 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.cldr.rdf;
import org.apache.jena.arq.querybuilder.SelectBuilder;
import org.apache.jena.query.Query;
import org.apache.jena.query.QuerySolution;
import org.apache.jena.query.ResultSet;
import org.apache.jena.sparql.lang.sparql_11.ParseException;
import org.unicode.cldr.util.XPathParts;
/**
* an XPathMapper responsible for language display names
* @author srl295
*/
public class LanguageMapper implements XPathMapper {
static final String O_LANGUAGE = "?language";
static final String DBO_639_3 = "dbo:iso6393Code";
static final String DBO_639_2 = "dbo:iso6392Code";
static final String DBO_639_1 = "dbo:iso6391Code";
static final boolean DEBUG = false;
public LanguageMapper() {
}
@Override
public int addEntries(AbstractCache cache) throws ParseException {
int newAdd = 0;
ResultSet rs = queryLanguageResources();
XPathParts xpp = XPathParts.getFrozenInstance("//ldml/localeDisplayNames/languages/language").cloneAsThawed();
while(rs.hasNext()) {
final QuerySolution qs = rs.next();
String code = QueryClient.getLiteralOrNull(qs, "iso6391");
if(code == null) {
code = QueryClient.getLiteralOrNull(qs, "iso6392");
}
final String res = QueryClient.getResourceOrNull(qs, "language");
if (code == null) {
// SPARQL should have filtered out this entry.
System.err.println("!!! SPARQL error: No code for " + res);
continue;
}
if (code.isEmpty()) {
if (DEBUG) {
System.err.println("!!! Empty code for " + res);
}
// This happens for https://dbpedia.org/page/Grebo_language which has a junk extra entry
continue;
}
if(code.length() != 3 && code.length() != 2) {
if(DEBUG) System.out.println("!!!" + rs.getRowNumber() + " - " + code + " " + res);
int spaceLoc = code.indexOf(' ');
if(!Character.isLowerCase(code.charAt(0))
|| (spaceLoc != 2 && spaceLoc != 3)) {
// Happens for Igbo
System.err.println("Rejecting wrong-length " + code + " for " + res);
continue;
}
code = code.substring(0, spaceLoc);
if(DEBUG) System.out.println("Fixed=> " + code);
} else {
if(DEBUG) System.out.println("== " + rs.getRowNumber() + " - " + code + " " + res);
}
xpp.setAttribute(-1, "type", code);
final String xpath = xpp.toString();
if(DEBUG) System.out.println("\t"+res+"\t"+xpath);
if(cache.add(xpath, res) == false) {
newAdd ++;
}
}
return newAdd;
}
public static ResultSet queryLanguageResources() throws ParseException {
final String resType = "?language";
final SelectBuilder builder = new SelectBuilder()
.addPrefix("dbo", QueryClient.PREFIX_DBO)
.addPrefix("dbr", QueryClient.PREFIX_DBR)
.addPrefix("rdf", QueryClient.PREFIX_RDF)
// .addPrefix("plg", PREFIX_PLG)
.addVar("*")
.addWhere(resType, "rdf:type", "dbo:Language")
.addWhere(resType, "<"+QueryClient.PREFIX_PLG+"hypernym"+">", "dbr:Language") // Only language (not dialect etc)
.addOptional(resType, "dbo:iso6392Code", "?iso6392")
.addOptional(resType, "dbo:iso6391Code", "?iso6391")
// .addOptional(resType, "dbo:iso6393Code", "?iso6393") // Future: 639-3 code
.addFilter("(bound(?iso6391) || bound(?iso6392))") // At least 639-1 or 639-2, or both
;
System.out.println(builder.buildString());
Query q = builder.build();
ResultSet results = QueryClient.getInstance().execSelect(q);
return results;
}
}