2009
Schulz, Hannes; Kersting, Kristian; Karwath, Andreas
ILP, the Blind, and the Elephant: Euclidean Embedding of Co-proven Queries Conference
Inductive Logic Programming, 19th International Conference, ILP 2009, Springer-Verlag Berlin Heidelberg Springer Verlag, Berlin Heidelberg, Germany, 2009, ISBN: 978-3-642-13839-3.
Abstract | Links | BibTeX | Tags: cheminformatics, dimensionality reduction, inductive logic programming, relational learning, scientific knowledge, visualization
@conference{schulz2009,
  title = {{ILP}, the Blind, and the Elephant: {Euclidean} Embedding of Co-proven Queries},
  author = {Hannes Schulz and Kristian Kersting and Andreas Karwath},
  url = {http://dx.doi.org/10.1007/978-3-642-13840-9_20},
  doi = {10.1007/978-3-642-13840-9_20},
  isbn = {978-3-642-13839-3},
  year = {2009},
  date = {2009-01-01},
  booktitle = {Inductive Logic Programming, 19th International Conference, ILP 2009},
  pages = {209--216},
  publisher = {Springer Verlag},
  address = {Berlin Heidelberg, Germany},
  organization = {Springer-Verlag Berlin Heidelberg},
  crossref = {DBLP:conf/ilp/2009},
  abstract = {Relational data is complex. This complexity makes one of the basic steps of ILP difficult: understanding the data and results. If the user cannot easily understand it, he draws incomplete conclusions. The situation is very much as in the parable of the blind men and the elephant that appears in many cultures. In this tale the blind work independently and with quite different pieces of information, thereby drawing very different conclusions about the nature of the beast. In contrast, visual representations make it easy to shift from one perspective to another while exploring and analyzing data. This paper describes a method for embedding interpretations and queries into a single, common Euclidean space based on their co-proven statistics. We demonstrate our method on real-world datasets showing that ILP results can indeed be captured at a glance.},
  keywords = {cheminformatics, dimensionality reduction, inductive logic programming, relational learning, scientific knowledge, visualization},
  pubstate = {published},
  tppubtype = {conference}
}
2008
Karwath, Andreas; Kersting, Kristian; Landwehr, Niels
Boosting Relational Sequence Alignments Conference
The 8th IEEE International Conference on Data Mining, ICDM 2008, IEEE, 2008, ISBN: 978-0-7695-3502-9.
Abstract | Links | BibTeX | Tags: inductive logic programming, machine learning, relational learning, scientific knowledge
@conference{karwath2008,
  title = {Boosting Relational Sequence Alignments},
  author = {Andreas Karwath and Kristian Kersting and Niels Landwehr},
  url = {http://dx.doi.org/10.1109/ICDM.2008.127},
  doi = {10.1109/ICDM.2008.127},
  isbn = {978-0-7695-3502-9},
  year = {2008},
  date = {2008-12-15},
  booktitle = {The 8th IEEE International Conference on Data Mining, ICDM 2008},
  pages = {857--862},
  publisher = {IEEE},
  crossref = {DBLP:conf/icdm/2008},
  abstract = {The task of aligning sequences arises in many applications. Classical dynamic programming approaches require the explicit state enumeration in the reward model. This is often impractical: the number of states grows very quickly with the number of domain objects and relations among these objects. Relational sequence alignment aims at exploiting symbolic structure to avoid the full enumeration. This comes at the expense of a more complex reward model selection problem: virtually infinitely many abstraction levels have to be explored. In this paper, we apply gradient-based boosting to leverage this problem. Specifically, we show how to reduce the learning problem to a series of relational regressions problems. The main benefit of this is that interactions between states variables are introduced only as needed, so that the potentially infinite search space is not explicitly considered. As our experimental results show, this boosting approach can significantly improve upon established results in challenging applications.},
  keywords = {inductive logic programming, machine learning, relational learning, scientific knowledge},
  pubstate = {published},
  tppubtype = {conference}
}
Kersting, Kristian; De Raedt, Luc; Gutmann, Bernd; Karwath, Andreas; Landwehr, Niels
Relational Sequence Learning Book Chapter
In: Probabilistic Inductive Logic Programming - Theory and Applications, vol. 4911, pp. 28-55, Springer Verlag, Berlin Heidelberg, Germany, 2008, ISBN: 978-3-540-78651-1.
Abstract | Links | BibTeX | Tags: inductive logic programming, machine learning, relational learning, scientific knowledge
@inbook{kersting2008,
  title = {Relational Sequence Learning},
  author = {Kristian Kersting and De Raedt, Luc and Bernd Gutmann and Andreas Karwath and Niels Landwehr},
  url = {http://dx.doi.org/10.1007/978-3-540-78652-8_2},
  doi = {10.1007/978-3-540-78652-8_2},
  isbn = {978-3-540-78651-1},
  year = {2008},
  date = {2008-01-01},
  booktitle = {Probabilistic Inductive Logic Programming - Theory and Applications},
  volume = {4911},
  pages = {28--55},
  publisher = {Springer Verlag},
  address = {Berlin Heidelberg, Germany},
  organization = {Springer-Verlag Berlin Heidelberg},
  crossref = {DBLP:conf/ilp/2008p},
  abstract = {Sequential behavior and sequence learning are essential to intelligence. Often the elements of sequences exhibit an internal structure that can elegantly be represented using relational atoms. Applying traditional sequential learning techniques to such relational sequences requires one either to ignore the internal structure or to live with a combinatorial explosion of the model complexity. This chapter briefly reviews relational sequence learning and describes several techniques tailored towards realizing this, such as local pattern mining techniques, (hidden) Markov models, conditional random fields, dynamic programming and reinforcement learning.},
  keywords = {inductive logic programming, machine learning, relational learning, scientific knowledge},
  pubstate = {published},
  tppubtype = {inbook}
}
2007
Karwath, Andreas; Kersting, Kristian
Relational Sequence Alignments and Logos Conference
Inductive Logic Programming, 16th International Conference, ILP 2006, vol. 4455, Lecture Notes in Computer Science Springer-Verlag Berlin Heidelberg Springer Verlag, Berlin Heidelberg, Germany, 2007, ISBN: 978-3-540-73846-6.
Abstract | Links | BibTeX | Tags: bioinformatics, inductive logic programming, relational learning, scientific knowledge
@conference{karwath2007,
  title = {Relational Sequence Alignments and Logos},
  author = {Andreas Karwath and Kristian Kersting},
  url = {http://dx.doi.org/10.1007/978-3-540-73847-3_29},
  doi = {10.1007/978-3-540-73847-3_29},
  isbn = {978-3-540-73846-6},
  year = {2007},
  date = {2007-01-01},
  booktitle = {Inductive Logic Programming, 16th International Conference, ILP 2006},
  volume = {4455},
  pages = {290--304},
  publisher = {Springer Verlag},
  address = {Berlin Heidelberg, Germany},
  organization = {Springer-Verlag Berlin Heidelberg},
  series = {Lecture Notes in Computer Science},
  crossref = {DBLP:conf/ilp/2006},
  abstract = {The need to measure sequence similarity arises in many application domains and often coincides with sequence alignment: the more similar two sequences are, the better they can be aligned. Aligning sequences not only shows how similar sequences are, it also shows where there are differences and correspondences between the sequences.
Traditionally, the alignment has been considered for sequences of flat symbols only. Many real world sequences such as natural language sentences and protein secondary structures, however, exhibit rich internal structures. This is akin to the problem of dealing with structured examples studied in the field of inductive logic programming (ILP). In this paper, we introduce Real, which is a powerful, yet simple approach to align sequence of structured symbols using well-established ILP distance measures within traditional alignment methods. Although straight-forward, experiments on protein data and Medline abstracts show that this approach works well in practice, that the resulting alignments can indeed provide more information than flat ones, and that they are meaningful to experts when represented graphically.},
  keywords = {bioinformatics, inductive logic programming, relational learning, scientific knowledge},
  pubstate = {published},
  tppubtype = {conference}
}
Traditionally, the alignment has been considered for sequences of flat symbols only. Many real world sequences such as natural language sentences and protein secondary structures, however, exhibit rich internal structures. This is akin to the problem of dealing with structured examples studied in the field of inductive logic programming (ILP). In this paper, we introduce Real, which is a powerful, yet simple approach to align sequence of structured symbols using well-established ILP distance measures within traditional alignment methods. Although straight-forward, experiments on protein data and Medline abstracts show that this approach works well in practice, that the resulting alignments can indeed provide more information than flat ones, and that they are meaningful to experts when represented graphically.
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
Logic and the Automatic Acquisition of Scientific Knowledge: An Application to Functional Genomics Conference
Computational Discovery of Scientific Knowledge, Introduction, Techniques, and Applications in Environmental and Life Sciences, vol. 4660, Lecture Notes in Computer Science Springer-Verlag Berlin Heidelberg Springer Verlag, Berlin Heidelberg, Germany, 2007, ISBN: 978-3-540-73919-7.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge
@conference{king2007,
  title = {Logic and the Automatic Acquisition of Scientific Knowledge: An Application to Functional Genomics},
  author = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
  url = {http://dx.doi.org/10.1007/978-3-540-73920-3_13},
  doi = {10.1007/978-3-540-73920-3_13},
  isbn = {978-3-540-73919-7},
  year = {2007},
  date = {2007-01-01},
  booktitle = {Computational Discovery of Scientific Knowledge, Introduction, Techniques, and Applications in Environmental and Life Sciences},
  volume = {4660},
  pages = {273--289},
  publisher = {Springer Verlag},
  address = {Berlin Heidelberg, Germany},
  organization = {Springer-Verlag Berlin Heidelberg},
  series = {Lecture Notes in Computer Science},
  crossref = {DBLP:conf/dis/2007book},
  abstract = {This paper is a manifesto aimed at computer scientists interested in developing and applying scientific discovery methods. It argues that: science is experiencing an unprecedented “explosion” in the amount of available data; traditional data analysis methods cannot deal with this increased quantity of data; there is an urgent need to automate the process of refining scientific data into scientific knowledge; inductive logic programming (ILP) is a data analysis framework well suited for this task; and exciting new scientific discoveries can be achieved using ILP scientific discovery methods. We describe an example of using ILP to analyse a large and complex bioinformatic database that has produced unexpected and interesting scientific results in functional genomics. We then point a possible way forward to integrating machine learning with scientific databases to form intelligent databases.},
  keywords = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge},
  pubstate = {published},
  tppubtype = {conference}
}
2006
Clare, Amanda; Karwath, Andreas; Ougham, Helen; King, Ross D.
Functional bioinformatics for Arabidopsis thaliana Journal Article
In: Bioinformatics, vol. 22, no. 9, pp. 1130-1136, 2006.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge
@article{karwath06a,
  title = {Functional bioinformatics for {Arabidopsis} thaliana},
  author = {Amanda Clare and Andreas Karwath and Helen Ougham and Ross D. King},
  url = {https://bioinformatics.oxfordjournals.org/content/22/9/1130.full.pdf+html},
  doi = {10.1093/bioinformatics/btl051},
  year = {2006},
  date = {2006-01-01},
  journal = {Bioinformatics},
  volume = {22},
  number = {9},
  pages = {1130--1136},
  abstract = {Motivation: The genome of Arabidopsis thaliana, which has the best understood plant genome, still has approximately one-third of its genes with no functional annotation at all from either MIPS or TAIR. We have applied our Data Mining Prediction (DMP) method to the problem of predicting the functional classes of these protein sequences. This method is based on using a hybrid machine-learning/data-mining method to identify patterns in the bioinformatic data about sequences that are predictive of function. We use data about sequence, predicted secondary structure, predicted structural domain, InterPro patterns, sequence similarity profile and expressions data.
Results: We predicted the functional class of a high percentage of the Arabidopsis genes with currently unknown function. These predictions are interpretable and have good test accuracies. We describe in detail seven of the rules produced.
Availability: Rulesets are available at http://www.aber.ac.uk/compsci/Research/bio/dss/arabpreds/ and predictions are available at http://www.genepredictions.org
Contact: afc@aber.ac.uk},
  keywords = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge},
  pubstate = {published},
  tppubtype = {article}
}
Results: We predicted the functional class of a high percentage of the Arabidopsis genes with currently unknown function. These predictions are interpretable and have good test accuracies. We describe in detail seven of the rules produced.
Availability: Rulesets are available at http://www.aber.ac.uk/compsci/Research/bio/dss/arabpreds/ and predictions are available at http://www.genepredictions.org
Contact: afc@aber.ac.uk
2002
Karwath, Andreas; King, Ross D.
Homology Induction: the use of machine learning to improve sequence similarity searches Journal Article
In: BMC Bioinformatics, vol. 3, no. 1, 2002.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning
@article{karwath02a,
  title = {Homology Induction: the use of machine learning to improve sequence similarity searches},
  author = {Andreas Karwath and Ross D. King},
  url = {http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-3-11},
  doi = {10.1186/1471-2105-3-11},
  year = {2002},
  date = {2002-04-23},
  journal = {BMC Bioinformatics},
  volume = {3},
  number = {1},
  pages = {11},
  abstract = {Background
The inference of homology between proteins is a key problem in molecular biology. The current best approaches only identify ~50% of homologies (with a false positive rate set at 1/1000).
Results
We present Homology Induction (HI), a new approach to inferring homology. HI uses machine learning to bootstrap from standard sequence similarity search methods. First a standard method is run, then HI learns rules which are true for sequences of high similarity to the target (assumed homologues) and not true for general sequences, these rules are then used to discriminate sequences in the twilight zone. To learn the rules HI describes the sequences in a novel way based on a bioinformatic knowledge base, and the machine learning method of inductive logic programming. To evaluate HI we used the PDB40D benchmark which lists sequences of known homology but low sequence similarity. We compared the HI methodology with PSI-BLAST alone and found HI performed significantly better. In addition, Receiver Operating Characteristic (ROC) curve analysis showed that these improvements were robust for all reasonable error costs. The predictive homology rules learnt by HI can be interpreted biologically to provide insight into conserved features of homologous protein families.
Conclusions
HI is a new technique for the detection of remote protein homology – a central bioinformatic problem. HI with PSI-BLAST is shown to outperform PSI-BLAST for all error costs. It is expected that similar improvements would be obtained using HI with any sequence similarity method.},
  keywords = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning},
  pubstate = {published},
  tppubtype = {article}
}
The inference of homology between proteins is a key problem in molecular biology. The current best approaches only identify ~50% of homologies (with a false positive rate set at 1/1000).
Results
We present Homology Induction (HI), a new approach to inferring homology. HI uses machine learning to bootstrap from standard sequence similarity search methods. First a standard method is run, then HI learns rules which are true for sequences of high similarity to the target (assumed homologues) and not true for general sequences, these rules are then used to discriminate sequences in the twilight zone. To learn the rules HI describes the sequences in a novel way based on a bioinformatic knowledge base, and the machine learning method of inductive logic programming. To evaluate HI we used the PDB40D benchmark which lists sequences of known homology but low sequence similarity. We compared the HI methodology with PSI-BLAST alone and found HI performed significantly better. In addition, Receiver Operating Characteristic (ROC) curve analysis showed that these improvements were robust for all reasonable error costs. The predictive homology rules learnt by HI can be interpreted biologically to provide insight into conserved features of homologous protein families.
Conclusions
HI is a new technique for the detection of remote protein homology – a central bioinformatic problem. HI with PSI-BLAST is shown to outperform PSI-BLAST for all error costs. It is expected that similar improvements would be obtained using HI with any sequence similarity method.
Karwath, Andreas
Large Logical Databases and their Applications to Molecular Biology PhD Thesis
University of Wales, Aberystwyth, 2002.
BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge
@phdthesis{karwath02b,
  title = {Large Logical Databases and their Applications to Molecular Biology},
  author = {Andreas Karwath},
  year = {2002},
  date = {2002-01-01},
  school = {University of Wales, Aberystwyth},
  keywords = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge},
  pubstate = {published},
  tppubtype = {phdthesis}
}
2001
Karwath, Andreas; King, Ross D.
An automated ILP server in the field of bioinformatics Conference
The Eleventh International Conference on Inductive Logic Programming, ILP 2001, vol. 2157, Lecture Notes in Computer Science Springer-Verlag Berlin Heidelberg Springer Verlag, Berlin Heidelberg, Germany, 2001, ISBN: 978-3-540-42538-0.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning
@conference{Karwath2001,
  title = {An automated {ILP} server in the field of bioinformatics},
  author = {Andreas Karwath and Ross D. King},
  editor = {Raghu Ramakrishnan and Michele Sebag},
  url = {http://link.springer.com/chapter/10.1007%2F3-540-44797-0_8},
  doi = {10.1007/3-540-44797-0_8},
  isbn = {978-3-540-42538-0},
  year = {2001},
  date = {2001-09-09},
  booktitle = {The Eleventh International Conference on Inductive Logic Programming, ILP 2001},
  volume = {2157},
  pages = {91--103},
  publisher = {Springer Verlag},
  address = {Berlin Heidelberg, Germany},
  organization = {Springer-Verlag Berlin Heidelberg},
  series = {Lecture Notes in Computer Science},
  abstract = {The identification of evolutionary related (homologous) proteins is a key problem in molecular biology. Here we present an inductive logic programming based method, Homology Induction (HI), which acts as a filter for existing sequence similarity searches to improve their performance in the detection of remote protein homologies. HI performs a PSI-BLAST search to generate positive, negative, and uncertain examples, and collects descriptions of these examples. It then learns rules to discriminate the positive and negative examples. The rules are used to filter the uncertain examples in the “twilight zone”. HI uses a multitable database of 51,430,710 pre-fabricated facts from a variety of biological sources, and the inductive logic programming system Aleph to induce rules. HI was tested on an independent set of protein sequences with equal or less than 40 per cent sequence similarity (PDB40D). ROC analysis is performed showing that HI can significantly improve existing similarity searches. The method is automated and can be used via a web/mail interface.},
  keywords = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning},
  pubstate = {published},
  tppubtype = {conference}
}
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
The utility of different representations of protein sequence for predicting functional class Journal Article
In: Bioinformatics, vol. 17, no. 5, pp. 445-454, 2001.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, relational learning, scientific knowledge
@article{King2001a,
  title = {The utility of different representations of protein sequence for predicting functional class},
  author = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
  url = {https://bioinformatics.oxfordjournals.org/content/17/5/445},
  doi = {10.1093/bioinformatics/17.5.445},
  year = {2001},
  date = {2001-01-19},
  journal = {Bioinformatics},
  volume = {17},
  number = {5},
  pages = {445--454},
  abstract = {Motivation: Data Mining Prediction (DMP) is a novel approach to predicting protein functional class from sequence. DMP works even in the absence of a homologous protein of known function. We investigate the utility of different ways of representing protein sequence in DMP (residue frequencies, phylogeny, predicted structure) using the Escherichia coli genome as a model.
Results: Using the different representations DMP learnt prediction rules that were more accurate than default at every level of function using every type of representation. The most effective way to represent sequence was using phylogeny (75% accuracy and 13% coverage of unassigned ORFs at the most general level of function: 69% accuracy and 7% coverage at the most detailed). We tested different methods for combining predictions from the different types of representation. These improved both the accuracy and coverage of predictions, e.g. 40% of all unassigned ORFs could be predicted at an estimated accuracy of 60% and 5% of unassigned ORFs could be predicted at an estimated accuracy of 86%.
Availability: The rules and data are freely available. Warmr is free to academics.
Contact: rdk@aber.ac.uk},
  keywords = {bioinformatics, data mining, inductive logic programming, relational learning, scientific knowledge},
  pubstate = {published},
  tppubtype = {article}
}
Results: Using the different representations DMP learnt prediction rules that were more accurate than default at every level of function using every type of representation. The most effective way to represent sequence was using phylogeny (75% accuracy and 13% coverage of unassigned ORFs at the most general level of function: 69% accuracy and 7% coverage at the most detailed). We tested different methods for combining predictions from the different types of representation. These improved both the accuracy and coverage of predictions, e.g. 40% of all unassigned ORFs could be predicted at an estimated accuracy of 60% and 5% of unassigned ORFs could be predicted at an estimated accuracy of 86%.
Availability: The rules and data are freely available. Warmr is free to academics.
Contact: rdk@aber.ac.uk
2000
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
Accurate prediction of protein functional class from sequence in the Mycobacterium tuberculosis and Escherichia coli genomes using data mining. Journal Article
In: Yeast (Comparative and Functional Genomics), vol. 17, pp. 283-293, 2000.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, relational learning, scientific knowledge
@article{king00a,
  title = {Accurate prediction of protein functional class from sequence in the {Mycobacterium} tuberculosis and {Escherichia} coli genomes using data mining},
  author = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
  url = {http://onlinelibrary.wiley.com/doi/10.1002/1097-0061(200012)17:4%3C283::AID-YEA52%3E3.0.CO;2-F/abstract},
  doi = {10.1002/1097-0061(200012)17:4<283::AID-YEA52>3.0.CO;2-F},
  year = {2000},
  date = {2000-12-08},
  journal = {Yeast (Comparative and Functional Genomics)},
  volume = {17},
  number = {4},
  pages = {283--293},
  abstract = {The analysis of genomics data needs to become as automated as its generation. Here we present a novel data-mining approach to predicting protein functional class from sequence. This method is based on a combination of inductive logic programming clustering and rule learning. We demonstrate the effectiveness of this approach on the M. tuberculosis and E. coli genomes, and identify biologically interpretable rules which predict protein functional class from information only available from the sequence. These rules predict 65% of the ORFs with no assigned function in M. tuberculosis and 24% of those in E. coli, with an estimated accuracy of 60–80% (depending on the level of functional assignment). The rules are founded on a combination of detection of remote homology, convergent evolution and horizontal gene transfer. We identify rules that predict protein functional class even in the absence of detectable sequence or structural homology. These rules give insight into the evolutionary history of M. tuberculosis and E. coli.},
  keywords = {bioinformatics, data mining, inductive logic programming, relational learning, scientific knowledge},
  pubstate = {published},
  tppubtype = {article}
}
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
Genome scale prediction of protein functional class from sequence using data mining Conference
The Sixth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD 2000, The Association for Computing Machinery, New York, USA, 2000.
Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, relational learning
@conference{King2000,
  title = {Genome scale prediction of protein functional class from sequence using data mining},
  author = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
  editor = {Raghu Ramakrishnan and S. Stolfo and R. Bayardo and I. Parsa},
  url = {http://doi.acm.org/10.1145/347090.347172},
  doi = {10.1145/347090.347172},
  year = {2000},
  date = {2000-01-01},
  booktitle = {The Sixth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD 2000},
  pages = {384--389},
  publisher = {The Association for Computing Machinery},
  address = {New York, USA},
  keywords = {bioinformatics, data mining, inductive logic programming, relational learning},
  pubstate = {published},
  tppubtype = {conference}
}