Google Interview Question
Software EngineersCountry: United States
example output:
---
$ javac DocComparisonInNgrams.java
$ java DocComparisonInNgrams
if n=1, then the shared ngrams are [Today,is] (count == 2)
if n=2, then the shared ngrams are [Today is] (count == 1)
if n=3, then the shared ngrams are [] (count == 0)
---
where DocComparisonInNgrams.java is:
import java.util.Set;
import java.util.HashSet;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
public class DocComparisonInNgrams {
public static void main(String args[]) {
List<String> sampleDocumentsForTest = getSampleDocumentsForTest();
for (int n=1; n <= 3; n++) {
Set<String> result = findSharedNGrams(sampleDocumentsForTest, n);
String sharedNGrams = "[";
if (result == null) {
result = new HashSet<String>();
}
for(String sharedNGram : result) {
if (!sharedNGrams.equals("[")) { sharedNGrams += ","; }
sharedNGrams += sharedNGram;
}
sharedNGrams += "]";
System.out.println("if n=" + n + ", then the shared ngrams are " + sharedNGrams + " (count == " + result.size() + ")");
}
}
protected static Set<String> findSharedNGrams(List<String> inputDocuments, int numberOfWordsInGram) {
int numDocs = inputDocuments.size();
Set<String> sharedNGrams = null;
for (int docIdx = 0; docIdx < numDocs; docIdx++) {
Set<String> docNGrams = new HashSet<String>();
String doc = inputDocuments.get(docIdx);
// tokenize document into n-grams
// - split string into words
String[] docInWords = doc.split(" ");
for (int wordIdx = numberOfWordsInGram; wordIdx < docInWords.length; wordIdx++) {
String ngram = "";
for (int idx = wordIdx - numberOfWordsInGram; idx < wordIdx; idx++) {
if (!ngram.equals("")) { ngram += " "; }
ngram += docInWords[idx];
}
if (ngram.equals("")) {
continue;
}
docNGrams.add(ngram);
}
// if first document, build set from that document
if (sharedNGrams == null) {
sharedNGrams = docNGrams;
} else {
// for each subsequent document, intersect this document's set with existing set (shared n grams only)
Set<String> intersectedNGramSet = new HashSet<String>();
for (String ngram : docNGrams) {
if (sharedNGrams.contains(ngram)) {
intersectedNGramSet.add(ngram);
}
}
sharedNGrams = intersectedNGramSet;
}
}
return sharedNGrams;
}
protected static List<String> getSampleDocumentsForTest() {
List<String> sampleDocuments = new ArrayList<String>();
sampleDocuments.add("Today is Sunday.");
sampleDocuments.add("Today is Saturday.");
return sampleDocuments;
}
}
function findSharedNGrams(d1, d2){
var t1=d1.split(' ');
var t2=d2.split(' ');
var l1=t1.length,l2=t2.length;
var i=0,j=0;
const sharedGrams = [];
while(i<l1 || j<l2){
if(t1[i] !== t2[j]){
break;
}else{
sharedGrams.push(t1[i]);
}
i++;j++;
}
const map = new Map;
let k=1;
while(k<=3 && k<=sharedGrams.length){
let ww = []
for(let x=0;x<sharedGrams.length;x+=k){
ww.push(sharedGrams.slice(x,x+k));
}
map.set(k,ww);
k++;
}
for(let n=1;n<=3;n++){
const value = map.has(n) ? map.get(n).join(" ") : "[]";
const size = map.has(n) ? map.get(n).length : 0;
console.log(`if n = ${n} then number of duplicates is ${size} (${value})`)
}
}
var d1 = "Today had an awesome lunch";
var d2 = "Today had an Very Tiring day";
findSharedNGrams(d1, d2);
What I understood is here in JS format:
function findSharedNGrams(d1, d2){
var t1=d1.split(' ');
var t2=d2.split(' ');
var l1=t1.length,l2=t2.length;
var i=0,j=0;
const sharedGrams = [];
while(i<l1 || j<l2){
if(t1[i] !== t2[j]){
break;
}else{
sharedGrams.push(t1[i]);
}
i++;j++;
}
const map = new Map;
let k=1;
while(k<=3 && k<=sharedGrams.length){
let ww = []
for(let x=0;x<sharedGrams.length;x+=k){
ww.push(sharedGrams.slice(x,x+k));
}
map.set(k,ww);
k++;
}
for(let n=1;n<=3;n++){
const value = map.has(n) ? map.get(n).join(" ") : "[]";
const size = map.has(n) ? map.get(n).length : 0;
console.log(`if n = ${n} then number of duplicates is ${size} (${value})`)
}
}
var d1 = "Today had an awesome lunch";
var d2 = "Today had an Very Tiring day";
findSharedNGrams(d1, d2);
- LANorth July 06, 2019