IBM Interview Question
Software Engineer / DevelopersCountry: United States
Not in Java, but this one's in Python
import re
def clean_input(text):
#non-alphanumeric character should be ignored
text = re.sub('[^a-zA-Z\s]', '', text)
#Any other block of contiguous whitespace should be treated as a single space
#white space should be retained
text = re.sub(' +',' ',text)
#Leading and trailing whitespace should be ignored
text = text.strip(' \t\n\r')
# You probably want this too
text = text.lower()
return text
def process(text):
#If they are also the same length, the earlier one in the input sequence should be kept.
# Using arrays (can use OrderedDict too, probably easier and nice, although below is clearer for you.
original_parts = text.split('|')
clean_parts = [clean_input(x) for x in original_parts]
original_parts_to_check = []
ignore_idx = []
for idx, ele in enumerate(original_parts):
if idx in ignore_idx:
continue
#The case of alphabetic characters should be ignored
if len(ele) < 2:
continue
#Duplicates must also be filtered -if two passages are considered equal with respect to the comparison rules listed above, only the shortest should be retained.
if clean_parts[idx] in clean_parts[:idx]+clean_parts[idx+1:]:
indices = [i for i, x in enumerate(clean_parts) if x == clean_parts[idx]]
use = indices[0]
for i in indices[1:]:
if len(original_parts[i]) < len(original_parts[use]):
use = i
if idx == use:
ignore_idx += indices
else:
ignore_idx += [x for x in indices if x != use]
continue
original_parts_to_check.append(idx)
# Doing the text in text matching here. Depending on size and type of dataset,
# Which you should test as it would affect this, you may want this as part
# of the function above, or before, etc. If you're only doing 100 bits of
# data, then it doesn't matter. Optimize accordingly.
text_to_return = []
clean_to_check = [clean_parts[x] for x in original_parts_to_check]
for idx in original_parts_to_check:
# This bit can be done better, but I have no more time to work on this.
if any([(clean_parts[idx] in clean_text) for clean_text in [x for x in clean_to_check if x != clean_parts[idx]]]):
continue
text_to_return.append(original_parts[idx])
#The retained passages should be output in their original form (identical to the input passage), and in the same order.
return '|'.join(text_to_return)
assert(process('IBM cognitive computing|IBM "cognitive" computing is a revolution| ibm cognitive computing|\'IBM Cognitive Computing\' is a revolution?') ==
'IBM "cognitive" computing is a revolution')
print(process('IBM cognitive computing|IBM "cognitive" computing is a revolution| ibm cognitive computing|\'IBM Cognitive Computing\' is a revolution?'))
assert(process('IBM cognitive computing|IBM "cognitive" computing is a revolution|the cognitive computing is a revolution') ==
'IBM "cognitive" computing is a revolution|the cognitive computing is a revolution')
print(process('IBM cognitive computing|IBM "cognitive" computing is a revolution|the cognitive computing is a revolution'))
Due to peoples request. Guys, I think it is good to learn pseudocoding on functional style.
Then, you can apply that to get faster, better imperative coding. NOTE: Java is not a very decent language for learning coding, python is much better.
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Created by NoOne on 17/10/16.
*/
public class Main {
static String[] s = new String[] { "IBM cognitive computing" , "IBM \"cognitive\" computing is a revolution" ,
"ibm cognitive computing" , "'IBM Cognitive Computing' is a revolution?" };
static List<String> tokenize(String text, String regex){
Pattern p = Pattern.compile( regex , Pattern.DOTALL);
Matcher m = p.matcher( text );
List<String> l = new ArrayList<>();
while ( m.find() ){
l.add(m.group());
}
return l;
}
static String keyString( List l, String sep){
StringBuffer buf = new StringBuffer();
for ( Object o : l ){
buf.append( o ).append(sep);
}
String s = buf.toString();
return s.substring(0,s.length());
}
static void doIBM( String[] s ){
String previousKey = "" ;
String previousValue = "" ;
for ( String item : s ){
List words = tokenize( item.toLowerCase() , "[a-z0-9]+");
String key = keyString( words , " " );
if ( key.indexOf( previousKey ) < 0 ||
( previousKey.equals(key) && item.length() > previousValue.length() ) ){
continue;
}
previousKey = key ;
previousValue = item ;
}
System.out.println(previousValue);
}
public static void main(String[] args){
doIBM(s);
}
}
import java.util.*;
public class Main{
public static String test1 = "IBM cognitive computing|IBM \"cognitive\" computing is a revolution|ibm cognitive computing|\'IBM Cognitive Computing\' is a revolution?";
public static String test2 = "IBM cognitive computing|IBM \"cognitive\" computing 'is' a revolution|the cognitive computing is a revolution";
public static ArrayList<String> unique = new ArrayList<String>();
public static ArrayList<String> uniqueR = new ArrayList<String>();
public static void main(String [] args){
String [] temp = test1.split("\\|");
//.replaceAll("[\\-\\+\\.\\^:,\"\']","")
System.out.println(Arrays.toString(temp));
for (int i = 0; i<temp.length; i++){
String test = temp[i].replaceAll("[\\-\\+\\.\\^:,\"\'?]","").toLowerCase();
boolean tz = checkUnique(temp, test);
if(tz) {
unique.add(temp[i]);
uniqueR.add(test);
System.out.println("ADDED UNIQUE: " + temp[i]);
}
}
System.out.println("UNIQUES ARE: " + unique);
}
public static boolean checkUnique(String [] temp, String test){
for(int x = 0; x<temp.length; x++){
String test2 = temp[x].replaceAll("[\\-\\+\\.\\^:,\"\'?]","").toLowerCase();
if(test2.equals(test)){
if(uniqueR.contains(test2)) {
return false;
}else{
continue;
}
}
System.out.println("TESTING: (" + test + ") with (" + test2 + ")");
if(test2.contains(test))
return false;
}
return true;
}
}
IMHO, using regex & trie seems to have a better complexity, O(kN), where k is the average length of each passage and N denotes the number of passages.
import java.util.ArrayList;
import java.util.LinkedHashMap;
/*
#1 Ignore leading or trailing whitespace
#2 Case insensitive
#3 Contiguous whitespace should be treated as a single space
#4 Non-alphanumeric character should be ignored
#5 Smaller length when same norm
#6 Eariler index when same length
#7 Output should be in order same as input
*/
public class Normalizer_1st{
public class Pair{
int index;
int length;
public Pair(int index, int length){
this.index = index;
this.length = length;
}
}
public class Trie{
LinkedHashMap<Character, Trie> children;
ArrayList<Pair> pairs;
public Trie(){
children = new LinkedHashMap<>();
pairs = new ArrayList<>();
}
public void add(String s, Pair pair){
if(s == null || s.length() == 0){
pairs.add(pair);
}
else{
char c = s.charAt(0);
Trie child = null;
if(children.containsKey(c)){
child = children.get(c);
}
else{
child = new Trie();
children.put(c, child);
}
String remainder = s.substring(1);
child.add(remainder, pair);
}
}
public ArrayList<Integer> getIndexes(){
ArrayList<Integer> list = new ArrayList<>();
StringBuilder sb = new StringBuilder();
int level = 0;
getIndexes(list, sb, level);
return list;
}
private int min(ArrayList<Pair> pairs){
int minIndex = Integer.MAX_VALUE;
int minLength = Integer.MAX_VALUE;
for(Pair pair : pairs){
int index = pair.index;
int length = pair.length;
if(length < minLength || (length == minLength && index < minIndex)){
minIndex = index;
minLength = length;
}
}
return minIndex;
}
private void getIndexes(ArrayList<Integer> list, StringBuilder sb, int level){
if(children.isEmpty()){
int index = min(pairs);
list.add(index);
return;
}
else{
for(char c : children.keySet()){
Trie child = children.get(c);
sb.append(c);
child.getIndexes(list, sb, level + 1);
sb.deleteCharAt(level);
}
}
}
}
public String normalize(String s){
s = s.toLowerCase();
s = s.replaceAll("^[\\s]+|[\\s]+$", "");
s = s.replaceAll("[^\\s\\w\\|]", "");
s = s.replaceAll("[\\s]+(?=\\|)|(?<=\\|)[\\s]+", "");
s = s.replaceAll("[\\s]+(?=[\\s])", " ");
return s;
}
public String process(String s){
String[] array = s.split("\\|");
Trie trie = new Trie();
for(int i = 0; i < array.length; i++)
trie.add(normalize(array[i]), new Pair(i, array[i].length()));
ArrayList<Integer> list = trie.getIndexes();
ArrayList<String> result = new ArrayList<>();
for(int i : list)
result.add(array[i]);
return String.join("|", result);
}
public static void main(String[] args){
Normalizer_1st tmp = new Normalizer_1st();
String input1 = "IBM cognitive computing|IBM \"cognitive\" computing is a revolution| ibm cognitive computing|'IBM Cognitive Computing' is a revolution?";
String input2 = "IBM cognitive computing|IBM \"cognitive\" computing is a revolution|the cognitive computing is a revolution";
String result = tmp.process(input1);
System.out.println(result);
result = tmp.process(input2);
System.out.println(result);
}
}
Explanation. We note that the statements can be mapped to a key.
- NoOne October 14, 2016A key that is to be used for comparison. Obviously, if key1 is contains in key2, we need to pick key2. That is inscribed in the code.