Jeanas_stupid_r_code
Stupid R Code, J. Drake 20260401
DF = datafile/filename Can load file from Import Dataset dropdown in RStudio. Make sure to set number columns to numerical. Setting one such columns sets all the other number columns Exported data will be in Finder->Jeana’s MacBook Pro->Macintosh HD->Users->Jeana Drake If receive “incomplete final line on Filename(.fasta), go to final line of fasta file and hit Enter and re-save.
Find out working directory
getwd()
Increase number of lines in output
max.print=# #can set at 100000 to be safe
End “+” prompt
Control + C
Creating a data frame
my_dataframe <- data.frame(Name = c(“Name1”, “Name2”), Age = c(1,2)) #Creating a data table uses the same commands except must open library(data.table) first and use <- data.table command
Extracting a subset of data from a table
library(data.table) SubsetDF <- setDT(LargeDF) [ColumnName %chin% SmallDF$ColumnName) library(xlsx) write.xlsx(SubsetDF, file = “SubsetDF.xlsx”)
Extracting a subset of sequences
library(seqinr) #make sure SequenceFile.fasta is in wd #load DF with list of sequences IDs in subset. Make sure the sequence ID does NOT have > NewFastaName <- read.fasta(file = “SequenceFile.fasta”, seqtype = c(“AA”), as.string = FALSE, seqonly = FALSE, strip.desc = FALSE) NewFastaName2 <- NewFastaName[names(NewFastaName) %in% DF$ColumnName] write.fasta(sequences = NewFastaName2, names = names(NewFastaName2), file.out = “NewFastaName2.fasta”, open = “w”, nbchar = 60, as.string = FALSE)
Find Poly-AA Stretches
library(Biostrings) library(strings) #Redirect console output to file in home directory (see commands below) NewFastaName <- readAAStringSet(“Filename.fasta”) CharacterVectorName <- as.character(NewFastaName) Fasta_Matches <- str_extract_all(CharacterVectorName, “[AApattern]{MinimumCount,}”) #Pattern is any amino acids of interest (e.g., [DE]) #Count for pattern is range contained in curly brackets, {#,} gives lower bounds, {#1,#2} gives lower and upper bounds, {,#} gives upper bounds names(Fasta_Matches) <- names(NewFastaNames) print(Fasta_Matches) #stop redirecting output to file
Redirect All Console Output to File
sink(file = “Filename.txt”) #Run R commands, end with print command print(OutputOfLastCommand) #Check output file sink() #Stops redirecting output and returns to the console
Merging files
merge(DF1, DF2, by.x = c(“ColumnNameDF1”), by.y = c(“ColumnNameDF2”), all = TRUE) #works even if there are different numbers of rows MergedDF <- merge(DF1, DF2, by.x = c(“ColumnNameDF1”), by.y = c(“ColumnNameDF2”), all = TRUE) #Makes a new table that can be exported. The first command just prints the output in the terminal.
AA Composition
library(xlsx) library(“CHNOSZ”) SubsetDF <- read.fasta(“filename.fasta”, iseq = NULL, ret = “count”, lines = NULL, ihead = NULL, start=NULL, stop=NULL, type=”protein”, id = NULL) #OR SubsetDF <- read.fasta(“filename.fasta”, seqtype = c(“AA”), as.string = FALSE, seqonly = FALSE, strip.desc = FALSE) write.xlsx(SubsetDF, file = “SubsetDF.xlsx”) #If R gives “line # did not have # elements” error, use BioStrings to accurately read FASTA file library(Biostrings) DF_fasta_data <- readAAStringSet(“filename.fasta”) #CHNOSZ does not seem to work with Biostrings-imported FASTA files DF_aa_counts <- letterFrequency(DF_fasta_data, letters = AA_ALPHABET) write.xlsx(DF_aa_counts, file = “DF_aa_counts.xlsx”) #If “WARNING: An illegal reflective access operation has occurred” error is shown, ignore it and check to see if the file has been exported to the Home Directory anyway
Isoelectric point (pI)
library(seqinr) library(Biostrings) library(xlsx) library(Peptides) DF_fasta_proteins <- read.fasta(file = “filename.fasta”, seqtype = “AA”, as.string = TRUE) #requires an empty row at the bottom of the fasta file DF_seq <- unlist(DF_fasta_proteins) DF_pi_values <- sapply(DF_seq, pI) DF_pI_export <- data.frame(Proteins = names(DF_pi_values), pI = DF_pi_values) write.xlsx(DF_pI_export, file = “DF_pI.xlsx”)
Translate DNA to protein
library(seqinr) translate(SEQ, frame = 0, sens = “F”, numcode = 1, NAstring = “X”, ambiguous = FALSE) #frames are 0, 1, or 2; sens F is forward, R would be reverse; numcode 1 is standard genetic code; NAstring X translates aa’s for ambiguous bases as X; ambiguous FALSE doesn’t take ambiguous bases into account #stop codons are printed as ‘*’ #list of numcodes (for mitochondrial, etc) are at the translate can page #the above translates one sequence, see below to translate from fasta file in home drive FASTA <- read.fasta(file = “SequenceFile.fasta”, seqtype = c(“DNA”), as.string = FALSE, seqonly = FALSE, strip.desc = FALSE) TRANSLATED_FASTA <- translate(seq = FASTA, etc) #Might require a carriage return after last line
Export table larger than xlsx limit (1,048,576 rows)
write.table(DF, file = “DF”, sep = “\t”, rownames = FALSE)
Export table when RStudio can’t install xlsx package
write.table(DF, file = “DF.txt”)
Boxplot
boxplot(NumericalColumn~FactorColumn, data = DF, cex.axis = #, cex.lab = #, ylim = c(#,#), ylab = “Text”, horizontal = TRUE, col = #) #cex.axis, cex.lab, cex.main is an expansion factor; >1 maxes larger, <1 makes smaller #ylim (or xlim) changes axis limit; can be reversed by switching values (xlim = c(##, #)) where values are related to axis size, not necessary to values in the spreadsheet (I.e., depths of 10 and 30 m might have c(##, #) values of c(2.5, 0.5)) #ylab (or xlab) changes axis label #horizontal rotates axes; must then rename axis labels #col sets the color according to the codes below in Mean Centered Cluster Plots #For multiple colors, col = c(“#”, “#”, “#”, etc) above #or boxplot(DF$NumericalColumn, DF$FactorColumn) boxplot(DF$Numerical ~ DF$Factor)
Boxplot for outliers
boxplot(DF$NumericalColumn, outcol = “red”, cex=#) #outcol sets the color of outlier circles #cex sets the outlier circle size
Different order to box plot
o = ordered(DF$FactorColumn, levels = c(“Factor1”, “Factor2”, etc)) boxplot(NumeralColumn~o, data = DF)
Variance of all
var(DF$NumeralColumn)
Variance of a subset
var(DF$NumeralColumn[DF$FactorColumn==“factorterm”])
Fligner test
fligner.test(DF$NumeralColumn~DF$FactorColumn) #test for homogeneity of variance
Shapiro test
shapiro.test(DF$NumeralColumn[DF$FactorColumn==“factorterm”]) #test for normal distribution #can add [1:5000] immediately after FactorColumn and continue in groups of 5000 if >5000 rows
One way ANOVA
oneway.test(DF$NumeralColumn~DF$FactorColumn) #independent samples, normal distribution, same variance
Tukey HSD post-hoc for ANOVA
data.lm <- lm(DF$DependentVariable~DF$IndependentVariable, data = DF) data.av <- aov(data.lm) summary(data.av) data.test <- TukeyHSD(data.av) data.test
Kruskal-Wallis test
kruskal.test(DF$NumeralColumn~DF$FactorColumn) #for non-normal distribution or inhomogeneous variance
Dunn’s test post-hoc for Kruskal-Wallis
library(dunn.test) dunn.test(DF$DependentVariable, DF$IndependentVariable, method=“bonferroni”)
Alternatively:
library(FSA) PT = dunnTest(DependentVariable ~ IndependentVariable, data=DF, method=“bonferroni”)
T-test
t.test(NumericalColumn~FactorColumn, data = DF, mv = 0, alt = “two.sided”, conf = 0.95, var.eq = F, paired = F) #mv = 0 is null hypothesis that mean=0.
Wilcoxin test
wilcox.test(DF$NumeralColumn~DF$FactorColumn) #non-parametric t-test
Pairwise Wilcoxin
pairwise.wilcox.test(DF$NumeralColumn, DF$FactorColumn, paired = FALSE, p.adjust.method=“BH”) #can use default p.adjust method
Rosner’s Test for Outliers
library(readr) library(outliers) library(EnvStats) #See box plot for outliers above rosnerTest(DF$NumericalColumn, k=#, warn=TRUE #k=# is number of red outlier circles plus one. Can ballpark if you can’t tell bc there are so many #If k=# value is too low, the same number of outliers as the # will be returned #$all.stats information will list the outliers first and will give the row number on which they fall
P-values
#all pheatmap library except for pheatmap
DF_clean = DF %>% select(“ColumnName”, starts_with(“Column1”, “Column2”)%>% as.data.frame() #ignore select if DF only has the information to be graphed #as.data.frame converts from tibble to dataframe DF_clean2 <- DF_clean[, -1] #removes row name column rownames(DF_clean2) <- DF_clean[, 1] #moves row name column over to dataframe’s rownames pValues <- apply(DF_clean2, 1, function(x) t.test(x[Column#:Column#],x[Column#:Column#])$p.value) #first range of column numbers is 1st group, second range is second group library(xlsx) write.xlsx(as.data.frame(pValues), file = “filename.xlsx”)
Fisher’s Exact Test
MatrixName = rbind(c(x,x,x), c(x,x,x), c(x,x,x), etc); MatrixName fisher.test(MatrixName, workspace = 2000000))
Histogram
hist(df$NumeralColumn)
Pie Chart (base)
Category1 <- c(“Name1”, “Name2”) Category2 <- c(1,2) DataName <- data.frame(Category1, Category2) #Can see data frame with the following command, but not necessary DataName mycolors <- c(“red”, “yellow”) pie(DataName$Category2, labels = Category1, cex = #, main = “Title”, col = mycolors) #cex changes the label font size #base pie command keeps order of Category1 pie(DataName$Category2, labels = NULL, cex = #, main = “Title”, col = mycolors) #Removes label names, replaces them with numbers that can be removed in Illustrator legend(“right”, legend = Category1, fill = mycolors, cex = #) #Add legend. Can use bottomright, bottom, bottomleft, left, or combos with top. #Legend overwrites part of pie chart. Generate pie chart w/ and w/o legend and crop if needed
Pie Chart (ggplot2)
library(ggplot2) #make data frame as above for base pie chart p <- ggplot(DataName, aes(x = “”, y = Category2/value, fill = Category1/category)) + geom_bar(width = 1, stat = “identity”) #ggplot doesn’t have a pie chart so start as a bar chart and the x=“” creates a fake x-axis #fill fills in the slices to assign colors later pie_chart <- p + coord_polar(theta = “y”) #converts the bar chart (Cartesian coordinates) to polar coordinates as a circle pie_chart_enhanced <- pie_chart + theme_void() + labs(title = “Distribution of Categories”) + geom_text(aes(label = category), position = position_stack(vjust = 0.5)) #Not required but enhances the appearance pie_chart_enhanced #Displays pie chart #ggplot2 pie command reorder Category1 alphabetically
Scatterplot
plot(df$x-axis, df$y-axis, col = “color”, pch = ##, xlim = range(c(#,#)), xlab = “Label”) #pch gives scatter as points and value changes the size, with smaller values yielding larger size, max value of 25 points(df$x-axis, df$y-axis, col = color2”, pch = ##) #Adds second set of data on same axes legend(“toplight”, legend = c(“Label1”, “Label2”), col = c(“color1”, “color2”), pch = 3#
Heatmap
library(tidyverse) library(magrittr) library(pheatmap) library(RColorBrewer) library(rio) library(dplyr) DF_clean = DF %>% mutate(newcolumnname = paste0(column1, “:”, column2) %>% select(“newcolumnname”, starts_with(“Numbercolumn1”, etc) %>% as.data.frame() #ignore mutate command if gene ID and annotation are already in the same column or if that information isn’t needed #ignore select if DF only has the information to be graphed #as.data.frame converts from tibble to dataframe DF_clean2 <- DF_clean[, -1] #removes row name column rownames(DF_clean2) <- DF_clean[, 1] #moves row name column over to dataframe’s rownames boxplot(DF_clean2, las = 2) #checks distribution of data #las = 2 makes sample label perpendicular DF_clean2_log10 = log10(DF_clean2) boxplot(DF_clean2_log10, las = 2) #minimizes variance pheatmap(DF_clean2 or DF_clean2_log10), border_color = NA, fontsize_row = #, fontsize_col = #, cluster_rows = F, show_rownames = F) #cluster_rows = F orders rows according to how they are in the dataframe, like if making a heat map to match a ranking graph #show_rownames = F removes rownames if making a giant heatmap of tons of genes #to set the order of the columns but keep their dendrogram, do the following: library(dendextend) phtmap <- pheatmap::pheatmap(DF_clean2) col_dend <- phtmap[[2]] col_dend <- dendextend::rotate(cold_dend, order = c(“Column1”, “Column2”, etc)) pheatmap(DF_clean2, cluster_cols=as.hclust(col_dend), all the other pheatmap stuff) #assigned column order must be possible given the dendrogram or it will give an error or just not change the order.
PCA
library(tidyverse) library(magrittr) library(RColorBrewer) library(rio) library(dplyr) #prcomp is standard in R #Load file #If file is small, can load with samples in rows and genes/proteins in columns. Remove sample name column as all data must be numerical and R won’t move row names in 1st column to row names using pheatmap commands #If file is large (like normalized and transformed deseq output) and can’t be transposed first in excel bc it exceeds the column limit, load as a normal file with samples as columns and genes as rows. Remove first row with sample names before loading DF = as.martrix(DF) #if file has samples as columns and genes as rows, transpose first using the following command. If file loaded already transposed, skip it DF_trans <- t(DF) DF_PCA <- prcomp(DF, center = TRUE, scale = TRUE #center shifts the variables to be zero centered #scale shifts the unit variance before the analysis takes place. Scaling is advisable by the rdocumentation. #if transpose command was used, make sure to do prcomp on DF_trans rather than DF summary(DF_PCA) #Gives SD, % of variance, and cumulative % of variance explained by each PC str(DF_PCA) #gives information about the PCA object itself rownames(DF) <- c(“Sample1”, “Sample2”, etc) #makes position of each sample in PCA plot show up as sample name library(devtools) library(ggbiplot) ggbiplot(DF_PCA, var.axes = FALSE, labels=rownames(DF), labels.size=#) #4 is a good size for row name font #var.axes removes the arrows for each factor placing the samples in the PCA. Each factor is a gene or protein #if xlim or ylim need to be adjusted or to add label or change background color, make the ggbiplot its own object DF_PCA_obj <- ggbiplot(DF_PCA, var.axes = FALSE, labels=rownames(DF), labels.size=#) DF_PCA_obj + coord_cartesian(xlim = c(#,#), ylim = (c(#,#) + theme_bw(), ggtitle(“Title”) + theme(plot.title = element_text(hjust = 0.5)) #theme_bw leaves gray lines from tick points but makes background white and puts solid black lines around the plot #theme(plot.title…) centers the title. The default is to have it left-justified for some stupid reason
nMDS
library(vegan) m_DF = as.matrix(DF) #just values, no row names or other columns DF_nmds = metaMDS(m_DF, distance = “dissimilarityindex”, maxit = 200, sfgrmin = 1e-7) #bray-curtis is default #maxit increases the # of iteration #sfgrmin decreases scale factor of the gradient, add this if getting the error ‘scale factor of the gradient <sfgrmin’ #if nmds doesn’t converge, try the following: DF_nmds2 = metaDMS(m_DF, distance = “dissimilarityindex”, previous.best = DF_nmds, maxit = 200) #if nmds still doesn’t converge, give up and do PCA
Make mean-centered cluster plots
maxplot(DF, type = “1”, pch = 1, col = #, ylab = “labelname”, xlay = “labelname”, xaxis = c(#,#), main = “titlename”) #type is line type assignment, 1=solid line #col is color assignment, 1=black, 2=pink, 3=green, 4=blue, 5=cyan, 6=magenta, 7=yellow, 8=gray, 9=black, 10=dark pink #can also use the HEX color code in “” “” #to make lines different colors, have each group as a separate tab #after maxplot, do the following: maxlines(DF, type = “1”, pch = 1, col = different#thaninmaxplot, lay = #forlinetype, lwd = #forlinewidth)
Transpose data.frame or data.matrix
Transposedf <- t(df)
Make a list for MOFA
Listname=list(title1=as.matrix(df1),title2=as.matrix(df2))
Make comma separated values onto separate lines
library(tidyverse) Sep_DF <- DF %>% tidyr::separate_rows(“Column name”, sep = “,”) library(xlsx) write.csv(Sep_DF, file = “Filename.csv”) #Replace , with whatever separator is used. #Open in excel and save as tab separated text file #For Blast2GO, change suffix to .annot
Signal peptide prediction
install.packages(“signalHsmm”) library(signalHsmm) #Read in fasta file per above, making sure that stop codon ‘*’ is removed from all sequences FASTA_signalP <- run_signalHsmm(FASTA[1:n]) #n = highest number of sequences desired library(xlsx) FASTA_signalP_table <- pred2df(FASTA_signalP) write.xlsx(FASTA_signalP_table, file = “FASTA_signal_table.xlsx”)
Inserting carriage return in text
#This is more for text/BBEdit #For eDNA sequencing core output for SNPs #Open in Excel
=(cell with “>”)&(cell with sequence) #Drop to all sequences #Control-c #Open in BBEdit #Right-click #Paste-values #Control-f Replace > Replace with >control-return #Replace all
Dendrogram
library(ape) DendroName <- read.tree(text = “NEWICK FILE TEXT PASTED IN”) #Ensure that the tree string ends in a semi-colon. #Make sure that brackets are matched. Some text editors, such as RStudio or Notepad++, contain built-in syntax highlighting that will highlight the counterpart to each parenthesis as you type. #Look out for ((double brackets)) or brackets enclosing a single node: the resultant ‘invisible nodes’ can cause R to crash. >plot(DendroName) #write.dendrogram from ape package doesn’t seem to work (could be RStudio version on my very old laptop) plot(DendroName, show.node.label = TRUE) #shows bootstrap values at nodes nodelabels() #labels the nodes with their number tiplabels() #labels the tips with their number #If the values beyond the decimal point should be shortened or changed, must be done manually in the Newick file; the node values to be changed are before the : in the format #.########(node value):#.########(node length) rt.DendroName# <- rotate(DendroName,node#) plot(rt.DendroName# #rotates at a node of Node# #Node #s seem to start at the top of the dendrogram rr.DendroName# <- root(DendroName,node=#) plot(rr.Dendroname#) #reroot at a node of Node#
Superscript & Subscripts
boxplot(NC~FC, data =DF, ylab = expression(paste(“Text“^”Text”*” Text”))) #^ raises the next alphanumeric, * lowers the next alphanumeric back to normal #[] around alphanumeric makes the contents subscripted, * lowers the next alphanumeric back to normal
Greek Letters and other symbols (Mac)
#RStudio allows copy/paste of symbols and emojis
Taxonomizr
library(taxonomizr) prepareDatabase(‘accessionTaxa.sql’) SQLite database accessionTaxa.sql already exists. Delete to regenerate #Do NOT delete the database and associated files [1] “accessionTaxa.sql” taxaYOURSAMPLE<-getId(c(“sps 1”, “sps 1”, “sps 3”, “etc”), ‘accessionTaxa.sql’) #Give whatever name you want for taxaYOURSAMPLE #Each Latin name must be in double quotations and separated by a comma #There can be a space between the names and the commas #Just copying the vertical list of Latin names from the Organism column of the Blast output doesn’t work #RStudio can only handle 4091 characters per line. To get around this for long lists of Latin names, highlight names in the alphabetized list (see Megablast .csv instructions above) up to 4000 characters inclusive of quotes and commas (BBEdit shows character count in the lower right side of the file border) starting with a double quote before a species name and the comma after a species name. Then hit enter. This yields a ‘+’ and starts a new line of the same command. You can then copy in the next batch of species names up to ~4000 characters and continue with species name batches until you run through your entire non-redundant list. On the last line of the command, include the “), ‘accessionTaxa.sql’)” to close out the command. DF<-getTaxonomy(taxaYOURSAMPLE, ‘accessionTaxa.sql’) write.table(DF, file = “DF.txt”) #This command writes a txt file to the home directory containing the taxonomy assignments for all species across all lines of the command above that are separated by ‘+’
Multiple Sequence Alignment
if (!require(“BiocManager”, quietly = TRUE))
- install.packages(“BiocManager”) BiocManager::install(“msa”) library(msa) #Make sure .fasta file is in Users directory msa(“Filename.fasta”, method = c(“ClustalOmega”), type = “protein” #Output to cluster format is not yet resolved.
Mapping Peptides to Proteins
library(protti) library(devtools) library(tidyverse) library(dplyr) library(magrittr) #Open .xlsx file of Accession# + Peptide #Open .xlsx file of Accession# + Protein Merged_ProtPep <- merge(DF1, DF2, by.x = c(“DF1AccessColumn”), by.y = c(“DF2AccessColumn”), all = TRUE) write.table(Merged_ProtPep, file = “Merged_ProtPep.txt”) #Open Merged_ProtPep ProtPepDF <- as.data.frame(Merged_ProtPep) PepMap <- find_peptide(data = ProtPepDF,protein_sequence = ProteinColumn,peptide_sequence = PeptideColumn) write.table(PepMap, file = “MappedPeptides.txt”)